1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
16 ; These patterns are produced by LoopVectorizer for interleaved loads.
18 define void @load_i16_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
19 ; SSE-LABEL: load_i16_stride3_vf2:
21 ; SSE-NEXT: movdqa (%rdi), %xmm0
22 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
23 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
24 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
25 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
26 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
27 ; SSE-NEXT: movd %xmm1, (%rsi)
28 ; SSE-NEXT: movd %xmm2, (%rdx)
29 ; SSE-NEXT: movd %xmm0, (%rcx)
32 ; AVX-LABEL: load_i16_stride3_vf2:
34 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
35 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
36 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
37 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
38 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
39 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
40 ; AVX-NEXT: vmovd %xmm1, (%rsi)
41 ; AVX-NEXT: vmovd %xmm2, (%rdx)
42 ; AVX-NEXT: vmovd %xmm0, (%rcx)
45 ; AVX2-LABEL: load_i16_stride3_vf2:
47 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
48 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
49 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
50 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
51 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
52 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
53 ; AVX2-NEXT: vmovd %xmm1, (%rsi)
54 ; AVX2-NEXT: vmovd %xmm2, (%rdx)
55 ; AVX2-NEXT: vmovd %xmm0, (%rcx)
58 ; AVX2-FP-LABEL: load_i16_stride3_vf2:
60 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
61 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
62 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
63 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
64 ; AVX2-FP-NEXT: vmovd %xmm1, (%rsi)
65 ; AVX2-FP-NEXT: vmovd %xmm2, (%rdx)
66 ; AVX2-FP-NEXT: vmovd %xmm0, (%rcx)
69 ; AVX2-FCP-LABEL: load_i16_stride3_vf2:
71 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
72 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
73 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
74 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
75 ; AVX2-FCP-NEXT: vmovd %xmm1, (%rsi)
76 ; AVX2-FCP-NEXT: vmovd %xmm2, (%rdx)
77 ; AVX2-FCP-NEXT: vmovd %xmm0, (%rcx)
80 ; AVX512-LABEL: load_i16_stride3_vf2:
82 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
83 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
84 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
85 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
86 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
87 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
88 ; AVX512-NEXT: vmovd %xmm1, (%rsi)
89 ; AVX512-NEXT: vmovd %xmm2, (%rdx)
90 ; AVX512-NEXT: vmovd %xmm0, (%rcx)
93 ; AVX512-FCP-LABEL: load_i16_stride3_vf2:
94 ; AVX512-FCP: # %bb.0:
95 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
96 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
97 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
98 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
99 ; AVX512-FCP-NEXT: vmovd %xmm1, (%rsi)
100 ; AVX512-FCP-NEXT: vmovd %xmm2, (%rdx)
101 ; AVX512-FCP-NEXT: vmovd %xmm0, (%rcx)
102 ; AVX512-FCP-NEXT: retq
104 ; AVX512DQ-LABEL: load_i16_stride3_vf2:
106 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
107 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
108 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
109 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
110 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
111 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
112 ; AVX512DQ-NEXT: vmovd %xmm1, (%rsi)
113 ; AVX512DQ-NEXT: vmovd %xmm2, (%rdx)
114 ; AVX512DQ-NEXT: vmovd %xmm0, (%rcx)
115 ; AVX512DQ-NEXT: retq
117 ; AVX512DQ-FCP-LABEL: load_i16_stride3_vf2:
118 ; AVX512DQ-FCP: # %bb.0:
119 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
120 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
121 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
122 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
123 ; AVX512DQ-FCP-NEXT: vmovd %xmm1, (%rsi)
124 ; AVX512DQ-FCP-NEXT: vmovd %xmm2, (%rdx)
125 ; AVX512DQ-FCP-NEXT: vmovd %xmm0, (%rcx)
126 ; AVX512DQ-FCP-NEXT: retq
128 ; AVX512BW-LABEL: load_i16_stride3_vf2:
130 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
131 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
132 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
133 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
134 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
135 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
136 ; AVX512BW-NEXT: vmovd %xmm1, (%rsi)
137 ; AVX512BW-NEXT: vmovd %xmm2, (%rdx)
138 ; AVX512BW-NEXT: vmovd %xmm0, (%rcx)
139 ; AVX512BW-NEXT: retq
141 ; AVX512BW-FCP-LABEL: load_i16_stride3_vf2:
142 ; AVX512BW-FCP: # %bb.0:
143 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
144 ; AVX512BW-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
145 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
146 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
147 ; AVX512BW-FCP-NEXT: vmovd %xmm1, (%rsi)
148 ; AVX512BW-FCP-NEXT: vmovd %xmm2, (%rdx)
149 ; AVX512BW-FCP-NEXT: vmovd %xmm0, (%rcx)
150 ; AVX512BW-FCP-NEXT: retq
152 ; AVX512DQ-BW-LABEL: load_i16_stride3_vf2:
153 ; AVX512DQ-BW: # %bb.0:
154 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
155 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
156 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
157 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
158 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
159 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
160 ; AVX512DQ-BW-NEXT: vmovd %xmm1, (%rsi)
161 ; AVX512DQ-BW-NEXT: vmovd %xmm2, (%rdx)
162 ; AVX512DQ-BW-NEXT: vmovd %xmm0, (%rcx)
163 ; AVX512DQ-BW-NEXT: retq
165 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride3_vf2:
166 ; AVX512DQ-BW-FCP: # %bb.0:
167 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
168 ; AVX512DQ-BW-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
169 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,3,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
170 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
171 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm1, (%rsi)
172 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm2, (%rdx)
173 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%rcx)
174 ; AVX512DQ-BW-FCP-NEXT: retq
175 %wide.vec = load <6 x i16>, ptr %in.vec, align 64
176 %strided.vec0 = shufflevector <6 x i16> %wide.vec, <6 x i16> poison, <2 x i32> <i32 0, i32 3>
177 %strided.vec1 = shufflevector <6 x i16> %wide.vec, <6 x i16> poison, <2 x i32> <i32 1, i32 4>
178 %strided.vec2 = shufflevector <6 x i16> %wide.vec, <6 x i16> poison, <2 x i32> <i32 2, i32 5>
179 store <2 x i16> %strided.vec0, ptr %out.vec0, align 64
180 store <2 x i16> %strided.vec1, ptr %out.vec1, align 64
181 store <2 x i16> %strided.vec2, ptr %out.vec2, align 64
185 define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
186 ; SSE-LABEL: load_i16_stride3_vf4:
188 ; SSE-NEXT: movdqa (%rdi), %xmm0
189 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
190 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
191 ; SSE-NEXT: movdqa %xmm0, %xmm3
192 ; SSE-NEXT: pand %xmm2, %xmm3
193 ; SSE-NEXT: pandn %xmm1, %xmm2
194 ; SSE-NEXT: por %xmm3, %xmm2
195 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
196 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
197 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
198 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
199 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,3,2,3,4,5,6,7]
200 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
201 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
202 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
203 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
204 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
205 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
206 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
207 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
208 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
209 ; SSE-NEXT: movq %xmm2, (%rsi)
210 ; SSE-NEXT: movq %xmm1, (%rdx)
211 ; SSE-NEXT: movq %xmm0, (%rcx)
214 ; AVX-LABEL: load_i16_stride3_vf4:
216 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
217 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
218 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
219 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
220 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
221 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
222 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
223 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
224 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
225 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
226 ; AVX-NEXT: vmovq %xmm2, (%rsi)
227 ; AVX-NEXT: vmovq %xmm3, (%rdx)
228 ; AVX-NEXT: vmovq %xmm0, (%rcx)
231 ; AVX2-LABEL: load_i16_stride3_vf4:
233 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
234 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
235 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
236 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
237 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
238 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
239 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
240 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
241 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
242 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
243 ; AVX2-NEXT: vmovq %xmm2, (%rsi)
244 ; AVX2-NEXT: vmovq %xmm3, (%rdx)
245 ; AVX2-NEXT: vmovq %xmm0, (%rcx)
248 ; AVX2-FP-LABEL: load_i16_stride3_vf4:
250 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
251 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
252 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
253 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
254 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
255 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
256 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
257 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
258 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
259 ; AVX2-FP-NEXT: vmovq %xmm2, (%rsi)
260 ; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
261 ; AVX2-FP-NEXT: vmovq %xmm0, (%rcx)
264 ; AVX2-FCP-LABEL: load_i16_stride3_vf4:
266 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
267 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
268 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
269 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
270 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
271 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
272 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
273 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
274 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
275 ; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi)
276 ; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
277 ; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx)
278 ; AVX2-FCP-NEXT: retq
280 ; AVX512-LABEL: load_i16_stride3_vf4:
282 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
283 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
284 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
285 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
286 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
287 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
288 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
289 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
290 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
291 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
292 ; AVX512-NEXT: vmovq %xmm2, (%rsi)
293 ; AVX512-NEXT: vmovq %xmm3, (%rdx)
294 ; AVX512-NEXT: vmovq %xmm0, (%rcx)
297 ; AVX512-FCP-LABEL: load_i16_stride3_vf4:
298 ; AVX512-FCP: # %bb.0:
299 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
300 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
301 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
302 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
303 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
304 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
305 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
306 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
307 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
308 ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
309 ; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
310 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
311 ; AVX512-FCP-NEXT: retq
313 ; AVX512DQ-LABEL: load_i16_stride3_vf4:
315 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
316 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
317 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
318 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
319 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
320 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
321 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
322 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
323 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
324 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
325 ; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
326 ; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
327 ; AVX512DQ-NEXT: vmovq %xmm0, (%rcx)
328 ; AVX512DQ-NEXT: retq
330 ; AVX512DQ-FCP-LABEL: load_i16_stride3_vf4:
331 ; AVX512DQ-FCP: # %bb.0:
332 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
333 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
334 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
335 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,12,13,2,3,u,u,u,u,u,u,u,u]
336 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
337 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,u,u,u,u,u,u,u,u]
338 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
339 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
340 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
341 ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
342 ; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
343 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
344 ; AVX512DQ-FCP-NEXT: retq
346 ; AVX512BW-LABEL: load_i16_stride3_vf4:
348 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0]
349 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1
350 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2
351 ; AVX512BW-NEXT: vpermi2w %xmm2, %xmm1, %xmm0
352 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0]
353 ; AVX512BW-NEXT: vpermi2w %xmm2, %xmm1, %xmm3
354 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
355 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
356 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
357 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
358 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
359 ; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
360 ; AVX512BW-NEXT: vmovq %xmm1, (%rcx)
361 ; AVX512BW-NEXT: retq
363 ; AVX512BW-FCP-LABEL: load_i16_stride3_vf4:
364 ; AVX512BW-FCP: # %bb.0:
365 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0]
366 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1
367 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
368 ; AVX512BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm0
369 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0]
370 ; AVX512BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm3
371 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,5,8,11,0,0,0,0]
372 ; AVX512BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm4
373 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi)
374 ; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
375 ; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx)
376 ; AVX512BW-FCP-NEXT: retq
378 ; AVX512DQ-BW-LABEL: load_i16_stride3_vf4:
379 ; AVX512DQ-BW: # %bb.0:
380 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0]
381 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm1
382 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm2
383 ; AVX512DQ-BW-NEXT: vpermi2w %xmm2, %xmm1, %xmm0
384 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0]
385 ; AVX512DQ-BW-NEXT: vpermi2w %xmm2, %xmm1, %xmm3
386 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
387 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
388 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
389 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
390 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi)
391 ; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
392 ; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rcx)
393 ; AVX512DQ-BW-NEXT: retq
395 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride3_vf4:
396 ; AVX512DQ-BW-FCP: # %bb.0:
397 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0]
398 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1
399 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
400 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm0
401 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0]
402 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm3
403 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,5,8,11,0,0,0,0]
404 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm2, %xmm1, %xmm4
405 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi)
406 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
407 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx)
408 ; AVX512DQ-BW-FCP-NEXT: retq
409 %wide.vec = load <12 x i16>, ptr %in.vec, align 64
410 %strided.vec0 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
411 %strided.vec1 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
412 %strided.vec2 = shufflevector <12 x i16> %wide.vec, <12 x i16> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
413 store <4 x i16> %strided.vec0, ptr %out.vec0, align 64
414 store <4 x i16> %strided.vec1, ptr %out.vec1, align 64
415 store <4 x i16> %strided.vec2, ptr %out.vec2, align 64
419 define void @load_i16_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
420 ; SSE-LABEL: load_i16_stride3_vf8:
422 ; SSE-NEXT: movdqa (%rdi), %xmm3
423 ; SSE-NEXT: movdqa 16(%rdi), %xmm2
424 ; SSE-NEXT: movdqa 32(%rdi), %xmm0
425 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0]
426 ; SSE-NEXT: movdqa %xmm3, %xmm4
427 ; SSE-NEXT: pand %xmm1, %xmm4
428 ; SSE-NEXT: pandn %xmm2, %xmm1
429 ; SSE-NEXT: por %xmm4, %xmm1
430 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
431 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
432 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
433 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
434 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,7,6,7]
435 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,2,1]
436 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5]
437 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm4[2,0]
438 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0]
439 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535]
440 ; SSE-NEXT: movdqa %xmm4, %xmm5
441 ; SSE-NEXT: pandn %xmm2, %xmm5
442 ; SSE-NEXT: movdqa %xmm3, %xmm6
443 ; SSE-NEXT: pand %xmm4, %xmm6
444 ; SSE-NEXT: por %xmm5, %xmm6
445 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,2,3,4,5,6,7]
446 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
447 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
448 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7]
449 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
450 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0]
451 ; SSE-NEXT: pand %xmm6, %xmm5
452 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[0,3,2,3,4,5,6,7]
453 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
454 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6]
455 ; SSE-NEXT: movdqa %xmm6, %xmm8
456 ; SSE-NEXT: pandn %xmm7, %xmm8
457 ; SSE-NEXT: por %xmm5, %xmm8
458 ; SSE-NEXT: pand %xmm4, %xmm2
459 ; SSE-NEXT: pandn %xmm3, %xmm4
460 ; SSE-NEXT: por %xmm2, %xmm4
461 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,1,2,0]
462 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
463 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
464 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
465 ; SSE-NEXT: pand %xmm6, %xmm2
466 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
467 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
468 ; SSE-NEXT: pandn %xmm0, %xmm6
469 ; SSE-NEXT: por %xmm2, %xmm6
470 ; SSE-NEXT: movaps %xmm1, (%rsi)
471 ; SSE-NEXT: movdqa %xmm8, (%rdx)
472 ; SSE-NEXT: movdqa %xmm6, (%rcx)
475 ; AVX-LABEL: load_i16_stride3_vf8:
477 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
478 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
479 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
480 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1]
481 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
482 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
483 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
484 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5],xmm3[6,7]
485 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
486 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5],xmm2[6],xmm4[7]
487 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
488 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
489 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6],xmm2[7]
490 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
491 ; AVX-NEXT: vmovdqa %xmm3, (%rsi)
492 ; AVX-NEXT: vmovdqa %xmm4, (%rdx)
493 ; AVX-NEXT: vmovdqa %xmm0, (%rcx)
496 ; AVX2-LABEL: load_i16_stride3_vf8:
498 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
499 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
500 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
501 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7]
502 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7]
503 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
504 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7]
505 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2],xmm4[3,4],xmm1[5],xmm4[6,7]
506 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
507 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6],xmm2[7]
508 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
509 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
510 ; AVX2-NEXT: vmovdqa %xmm3, (%rsi)
511 ; AVX2-NEXT: vmovdqa %xmm4, (%rdx)
512 ; AVX2-NEXT: vmovdqa %xmm0, (%rcx)
515 ; AVX2-FP-LABEL: load_i16_stride3_vf8:
517 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
518 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
519 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2
520 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7]
521 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7]
522 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
523 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7]
524 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2],xmm4[3,4],xmm1[5],xmm4[6,7]
525 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
526 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6],xmm2[7]
527 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
528 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
529 ; AVX2-FP-NEXT: vmovdqa %xmm3, (%rsi)
530 ; AVX2-FP-NEXT: vmovdqa %xmm4, (%rdx)
531 ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rcx)
534 ; AVX2-FCP-LABEL: load_i16_stride3_vf8:
536 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
537 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
538 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
539 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7]
540 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7]
541 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
542 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7]
543 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2],xmm4[3,4],xmm1[5],xmm4[6,7]
544 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
545 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6],xmm2[7]
546 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
547 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
548 ; AVX2-FCP-NEXT: vmovdqa %xmm3, (%rsi)
549 ; AVX2-FCP-NEXT: vmovdqa %xmm4, (%rdx)
550 ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rcx)
551 ; AVX2-FCP-NEXT: retq
553 ; AVX512-LABEL: load_i16_stride3_vf8:
555 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
556 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
557 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
558 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7]
559 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7]
560 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
561 ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7]
562 ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2],xmm4[3,4],xmm1[5],xmm4[6,7]
563 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
564 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6],xmm2[7]
565 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
566 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
567 ; AVX512-NEXT: vmovdqa %xmm3, (%rsi)
568 ; AVX512-NEXT: vmovdqa %xmm4, (%rdx)
569 ; AVX512-NEXT: vmovdqa %xmm0, (%rcx)
572 ; AVX512-FCP-LABEL: load_i16_stride3_vf8:
573 ; AVX512-FCP: # %bb.0:
574 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
575 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
576 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
577 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7]
578 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7]
579 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
580 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7]
581 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2],xmm4[3,4],xmm1[5],xmm4[6,7]
582 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
583 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6],xmm2[7]
584 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
585 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
586 ; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rsi)
587 ; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rdx)
588 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rcx)
589 ; AVX512-FCP-NEXT: retq
591 ; AVX512DQ-LABEL: load_i16_stride3_vf8:
593 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
594 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
595 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2
596 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7]
597 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7]
598 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
599 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7]
600 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2],xmm4[3,4],xmm1[5],xmm4[6,7]
601 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
602 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6],xmm2[7]
603 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
604 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
605 ; AVX512DQ-NEXT: vmovdqa %xmm3, (%rsi)
606 ; AVX512DQ-NEXT: vmovdqa %xmm4, (%rdx)
607 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rcx)
608 ; AVX512DQ-NEXT: retq
610 ; AVX512DQ-FCP-LABEL: load_i16_stride3_vf8:
611 ; AVX512DQ-FCP: # %bb.0:
612 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
613 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
614 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
615 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7]
616 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7]
617 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
618 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7]
619 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm1[2],xmm4[3,4],xmm1[5],xmm4[6,7]
620 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
621 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6],xmm2[7]
622 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
623 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
624 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rsi)
625 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rdx)
626 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rcx)
627 ; AVX512DQ-FCP-NEXT: retq
629 ; AVX512BW-LABEL: load_i16_stride3_vf8:
631 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21]
632 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
633 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
634 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
635 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22]
636 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
637 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23]
638 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
639 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
640 ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx)
641 ; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx)
642 ; AVX512BW-NEXT: vzeroupper
643 ; AVX512BW-NEXT: retq
645 ; AVX512BW-FCP-LABEL: load_i16_stride3_vf8:
646 ; AVX512BW-FCP: # %bb.0:
647 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21]
648 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
649 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
650 ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
651 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22]
652 ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
653 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23]
654 ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
655 ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi)
656 ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
657 ; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
658 ; AVX512BW-FCP-NEXT: vzeroupper
659 ; AVX512BW-FCP-NEXT: retq
661 ; AVX512DQ-BW-LABEL: load_i16_stride3_vf8:
662 ; AVX512DQ-BW: # %bb.0:
663 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21]
664 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
665 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2
666 ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
667 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22]
668 ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
669 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23]
670 ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
671 ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi)
672 ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx)
673 ; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx)
674 ; AVX512DQ-BW-NEXT: vzeroupper
675 ; AVX512DQ-BW-NEXT: retq
677 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride3_vf8:
678 ; AVX512DQ-BW-FCP: # %bb.0:
679 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,3,6,9,12,15,18,21]
680 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
681 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
682 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
683 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,4,7,10,13,16,19,22]
684 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
685 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,5,8,11,14,17,20,23]
686 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
687 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi)
688 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
689 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
690 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
691 ; AVX512DQ-BW-FCP-NEXT: retq
692 %wide.vec = load <24 x i16>, ptr %in.vec, align 64
693 %strided.vec0 = shufflevector <24 x i16> %wide.vec, <24 x i16> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
694 %strided.vec1 = shufflevector <24 x i16> %wide.vec, <24 x i16> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
695 %strided.vec2 = shufflevector <24 x i16> %wide.vec, <24 x i16> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
696 store <8 x i16> %strided.vec0, ptr %out.vec0, align 64
697 store <8 x i16> %strided.vec1, ptr %out.vec1, align 64
698 store <8 x i16> %strided.vec2, ptr %out.vec2, align 64
702 define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
703 ; SSE-LABEL: load_i16_stride3_vf16:
705 ; SSE-NEXT: movdqa 80(%rdi), %xmm0
706 ; SSE-NEXT: movdqa 64(%rdi), %xmm1
707 ; SSE-NEXT: movdqa (%rdi), %xmm7
708 ; SSE-NEXT: movdqa 16(%rdi), %xmm4
709 ; SSE-NEXT: movdqa 32(%rdi), %xmm3
710 ; SSE-NEXT: movdqa 48(%rdi), %xmm2
711 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0]
712 ; SSE-NEXT: movdqa %xmm6, %xmm8
713 ; SSE-NEXT: pandn %xmm4, %xmm8
714 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535]
715 ; SSE-NEXT: movdqa %xmm7, %xmm9
716 ; SSE-NEXT: movdqa %xmm5, %xmm10
717 ; SSE-NEXT: pandn %xmm7, %xmm10
718 ; SSE-NEXT: pand %xmm6, %xmm7
719 ; SSE-NEXT: por %xmm8, %xmm7
720 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3]
721 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,6,7]
722 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3]
723 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,1,4,5,6,7]
724 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm4[0,1,2,3,4,7,6,7]
725 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,1,2,1]
726 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,5]
727 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,0],xmm8[2,0]
728 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm11[2,0]
729 ; SSE-NEXT: movdqa %xmm2, %xmm8
730 ; SSE-NEXT: pand %xmm6, %xmm8
731 ; SSE-NEXT: pandn %xmm1, %xmm6
732 ; SSE-NEXT: por %xmm8, %xmm6
733 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3]
734 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7]
735 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3]
736 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,1,4,5,6,7]
737 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm1[0,1,2,3,4,7,6,7]
738 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,1,2,1]
739 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,5]
740 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,0],xmm8[2,0]
741 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm11[2,0]
742 ; SSE-NEXT: movdqa %xmm5, %xmm8
743 ; SSE-NEXT: pandn %xmm4, %xmm8
744 ; SSE-NEXT: pand %xmm5, %xmm9
745 ; SSE-NEXT: por %xmm8, %xmm9
746 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm9[2,1,2,3,4,5,6,7]
747 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7]
748 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
749 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,2,3,0,4,5,6,7]
750 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm8[0,1,2,3,5,5,5,5]
751 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,0]
752 ; SSE-NEXT: pand %xmm8, %xmm11
753 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[0,3,2,3,4,5,6,7]
754 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3]
755 ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,4,4,5,6]
756 ; SSE-NEXT: movdqa %xmm8, %xmm9
757 ; SSE-NEXT: pandn %xmm12, %xmm9
758 ; SSE-NEXT: por %xmm11, %xmm9
759 ; SSE-NEXT: movdqa %xmm5, %xmm11
760 ; SSE-NEXT: pandn %xmm1, %xmm11
761 ; SSE-NEXT: movdqa %xmm2, %xmm12
762 ; SSE-NEXT: pand %xmm5, %xmm12
763 ; SSE-NEXT: por %xmm11, %xmm12
764 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm12[2,1,2,3,4,5,6,7]
765 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7]
766 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,3,2,3]
767 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,2,3,0,4,5,6,7]
768 ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,5,5,5,5]
769 ; SSE-NEXT: pand %xmm8, %xmm12
770 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[0,3,2,3,4,5,6,7]
771 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,0,3]
772 ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,4,4,5,6]
773 ; SSE-NEXT: movdqa %xmm8, %xmm11
774 ; SSE-NEXT: pandn %xmm13, %xmm11
775 ; SSE-NEXT: por %xmm12, %xmm11
776 ; SSE-NEXT: pand %xmm5, %xmm4
777 ; SSE-NEXT: por %xmm10, %xmm4
778 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0]
779 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
780 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3]
781 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,0,3,4,5,6,7]
782 ; SSE-NEXT: pand %xmm8, %xmm4
783 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
784 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
785 ; SSE-NEXT: movdqa %xmm8, %xmm10
786 ; SSE-NEXT: pandn %xmm3, %xmm10
787 ; SSE-NEXT: por %xmm4, %xmm10
788 ; SSE-NEXT: pand %xmm5, %xmm1
789 ; SSE-NEXT: pandn %xmm2, %xmm5
790 ; SSE-NEXT: por %xmm1, %xmm5
791 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,1,2,0]
792 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
793 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
794 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
795 ; SSE-NEXT: pand %xmm8, %xmm1
796 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
797 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
798 ; SSE-NEXT: pandn %xmm0, %xmm8
799 ; SSE-NEXT: por %xmm1, %xmm8
800 ; SSE-NEXT: movaps %xmm6, 16(%rsi)
801 ; SSE-NEXT: movaps %xmm7, (%rsi)
802 ; SSE-NEXT: movdqa %xmm11, 16(%rdx)
803 ; SSE-NEXT: movdqa %xmm9, (%rdx)
804 ; SSE-NEXT: movdqa %xmm8, 16(%rcx)
805 ; SSE-NEXT: movdqa %xmm10, (%rcx)
808 ; AVX-LABEL: load_i16_stride3_vf16:
810 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm0
811 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm1
812 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
813 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
814 ; AVX-NEXT: vmovdqa (%rdi), %xmm3
815 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm4
816 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm5
817 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm6
818 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,3,2,3,4,5,6,7]
819 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,3]
820 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3,4,5,6,7]
821 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,1,2,1]
822 ; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5]
823 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7]
824 ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
825 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5],xmm7[6,7]
826 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2
827 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
828 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7]
829 ; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
830 ; AVX-NEXT: vpshufb %xmm8, %xmm7, %xmm7
831 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
832 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0],xmm9[1,2],xmm5[3],xmm9[4,5],xmm5[6],xmm9[7]
833 ; AVX-NEXT: vpshufb %xmm8, %xmm9, %xmm8
834 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7
835 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7]
836 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15]
837 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,1,2,3]
838 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
839 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
840 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
841 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6],xmm5[7]
842 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
843 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
844 ; AVX-NEXT: vmovaps %ymm2, (%rsi)
845 ; AVX-NEXT: vmovaps %ymm7, (%rdx)
846 ; AVX-NEXT: vmovaps %ymm0, (%rcx)
847 ; AVX-NEXT: vzeroupper
850 ; AVX2-LABEL: load_i16_stride3_vf16:
852 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
853 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
854 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
855 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
856 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
857 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14],ymm3[15]
858 ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
859 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm3
860 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm4
861 ; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
862 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
863 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
864 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15]
865 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
866 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
867 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
868 ; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm5
869 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
870 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
871 ; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
872 ; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
873 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
874 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
875 ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
876 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4]
877 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
878 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
879 ; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
880 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
881 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
882 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
883 ; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
884 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
885 ; AVX2-NEXT: vpshufb %xmm1, %xmm3, %xmm1
886 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
887 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
888 ; AVX2-NEXT: vmovdqa %ymm2, (%rsi)
889 ; AVX2-NEXT: vmovdqa %ymm5, (%rdx)
890 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
891 ; AVX2-NEXT: vzeroupper
894 ; AVX2-FP-LABEL: load_i16_stride3_vf16:
896 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
897 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1
898 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
899 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
900 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
901 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14],ymm3[15]
902 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
903 ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm3
904 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm4
905 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
906 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
907 ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
908 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15]
909 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
910 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
911 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
912 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm5
913 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
914 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
915 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
916 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
917 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
918 ; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
919 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
920 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4]
921 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
922 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
923 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
924 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
925 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
926 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
927 ; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
928 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
929 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm3, %xmm1
930 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
931 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
932 ; AVX2-FP-NEXT: vmovdqa %ymm2, (%rsi)
933 ; AVX2-FP-NEXT: vmovdqa %ymm5, (%rdx)
934 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rcx)
935 ; AVX2-FP-NEXT: vzeroupper
938 ; AVX2-FCP-LABEL: load_i16_stride3_vf16:
940 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
941 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
942 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
943 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
944 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
945 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14],ymm3[15]
946 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
947 ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm3
948 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
949 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
950 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
951 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
952 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15]
953 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
954 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
955 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
956 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm5
957 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
958 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
959 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
960 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
961 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
962 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
963 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
964 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4]
965 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
966 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
967 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
968 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
969 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
970 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
971 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
972 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
973 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1
974 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
975 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
976 ; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rsi)
977 ; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rdx)
978 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rcx)
979 ; AVX2-FCP-NEXT: vzeroupper
980 ; AVX2-FCP-NEXT: retq
982 ; AVX512-LABEL: load_i16_stride3_vf16:
984 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
985 ; AVX512-NEXT: vmovdqa (%rdi), %ymm2
986 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
987 ; AVX512-NEXT: vmovdqa %ymm0, %ymm3
988 ; AVX512-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm3
989 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
990 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
991 ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
992 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm4
993 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5
994 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
995 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
996 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
997 ; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15]
998 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
999 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
1000 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
1001 ; AVX512-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm6
1002 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
1003 ; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
1004 ; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
1005 ; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
1006 ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
1007 ; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
1008 ; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15]
1009 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,6,7,4]
1010 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
1011 ; AVX512-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0
1012 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
1013 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
1014 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
1015 ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
1016 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
1017 ; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm1
1018 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
1019 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
1020 ; AVX512-NEXT: vmovdqa %ymm3, (%rsi)
1021 ; AVX512-NEXT: vmovdqa %ymm6, (%rdx)
1022 ; AVX512-NEXT: vmovdqa %ymm0, (%rcx)
1023 ; AVX512-NEXT: vzeroupper
1026 ; AVX512-FCP-LABEL: load_i16_stride3_vf16:
1027 ; AVX512-FCP: # %bb.0:
1028 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
1029 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2
1030 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
1031 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3
1032 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm3
1033 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
1034 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
1035 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
1036 ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm4
1037 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
1038 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
1039 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
1040 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
1041 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15]
1042 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
1043 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
1044 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
1045 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm6
1046 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
1047 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
1048 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
1049 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
1050 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
1051 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
1052 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15]
1053 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,6,7,4]
1054 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
1055 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0
1056 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
1057 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
1058 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
1059 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
1060 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
1061 ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1
1062 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
1063 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
1064 ; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rsi)
1065 ; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rdx)
1066 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rcx)
1067 ; AVX512-FCP-NEXT: vzeroupper
1068 ; AVX512-FCP-NEXT: retq
1070 ; AVX512DQ-LABEL: load_i16_stride3_vf16:
1071 ; AVX512DQ: # %bb.0:
1072 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
1073 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2
1074 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
1075 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm3
1076 ; AVX512DQ-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm3
1077 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
1078 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
1079 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
1080 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm4
1081 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm5
1082 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
1083 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
1084 ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
1085 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15]
1086 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
1087 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
1088 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
1089 ; AVX512DQ-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm6
1090 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
1091 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
1092 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
1093 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
1094 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
1095 ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
1096 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15]
1097 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,6,7,4]
1098 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
1099 ; AVX512DQ-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0
1100 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
1101 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
1102 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
1103 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
1104 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
1105 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm2, %xmm1
1106 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
1107 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
1108 ; AVX512DQ-NEXT: vmovdqa %ymm3, (%rsi)
1109 ; AVX512DQ-NEXT: vmovdqa %ymm6, (%rdx)
1110 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
1111 ; AVX512DQ-NEXT: vzeroupper
1112 ; AVX512DQ-NEXT: retq
1114 ; AVX512DQ-FCP-LABEL: load_i16_stride3_vf16:
1115 ; AVX512DQ-FCP: # %bb.0:
1116 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
1117 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2
1118 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
1119 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3
1120 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm3
1121 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
1122 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
1123 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
1124 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm4
1125 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
1126 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
1127 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
1128 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
1129 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15]
1130 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
1131 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
1132 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
1133 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm6
1134 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
1135 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
1136 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
1137 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
1138 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
1139 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
1140 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15]
1141 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,6,7,4]
1142 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
1143 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0
1144 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
1145 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
1146 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
1147 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
1148 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
1149 ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1
1150 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
1151 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
1152 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rsi)
1153 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rdx)
1154 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rcx)
1155 ; AVX512DQ-FCP-NEXT: vzeroupper
1156 ; AVX512DQ-FCP-NEXT: retq
1158 ; AVX512BW-LABEL: load_i16_stride3_vf16:
1159 ; AVX512BW: # %bb.0:
1160 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1161 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
1162 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45]
1163 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
1164 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46]
1165 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
1166 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23,26,29,32,35,38,41,44,47]
1167 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
1168 ; AVX512BW-NEXT: vmovdqa %ymm2, (%rsi)
1169 ; AVX512BW-NEXT: vmovdqa %ymm3, (%rdx)
1170 ; AVX512BW-NEXT: vmovdqa %ymm4, (%rcx)
1171 ; AVX512BW-NEXT: vzeroupper
1172 ; AVX512BW-NEXT: retq
1174 ; AVX512BW-FCP-LABEL: load_i16_stride3_vf16:
1175 ; AVX512BW-FCP: # %bb.0:
1176 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1177 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1178 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45]
1179 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
1180 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46]
1181 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
1182 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23,26,29,32,35,38,41,44,47]
1183 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
1184 ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rsi)
1185 ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx)
1186 ; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rcx)
1187 ; AVX512BW-FCP-NEXT: vzeroupper
1188 ; AVX512BW-FCP-NEXT: retq
1190 ; AVX512DQ-BW-LABEL: load_i16_stride3_vf16:
1191 ; AVX512DQ-BW: # %bb.0:
1192 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
1193 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
1194 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45]
1195 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
1196 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46]
1197 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
1198 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23,26,29,32,35,38,41,44,47]
1199 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
1200 ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rsi)
1201 ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rdx)
1202 ; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rcx)
1203 ; AVX512DQ-BW-NEXT: vzeroupper
1204 ; AVX512DQ-BW-NEXT: retq
1206 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride3_vf16:
1207 ; AVX512DQ-BW-FCP: # %bb.0:
1208 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1209 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1210 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45]
1211 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
1212 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46]
1213 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
1214 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23,26,29,32,35,38,41,44,47]
1215 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
1216 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rsi)
1217 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx)
1218 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rcx)
1219 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
1220 ; AVX512DQ-BW-FCP-NEXT: retq
1221 %wide.vec = load <48 x i16>, ptr %in.vec, align 64
1222 %strided.vec0 = shufflevector <48 x i16> %wide.vec, <48 x i16> poison, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
1223 %strided.vec1 = shufflevector <48 x i16> %wide.vec, <48 x i16> poison, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
1224 %strided.vec2 = shufflevector <48 x i16> %wide.vec, <48 x i16> poison, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
1225 store <16 x i16> %strided.vec0, ptr %out.vec0, align 64
1226 store <16 x i16> %strided.vec1, ptr %out.vec1, align 64
1227 store <16 x i16> %strided.vec2, ptr %out.vec2, align 64
1231 define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
1232 ; SSE-LABEL: load_i16_stride3_vf32:
1234 ; SSE-NEXT: subq $40, %rsp
1235 ; SSE-NEXT: movdqa 96(%rdi), %xmm5
1236 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1237 ; SSE-NEXT: movdqa 176(%rdi), %xmm6
1238 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1239 ; SSE-NEXT: movdqa 144(%rdi), %xmm13
1240 ; SSE-NEXT: movdqa 160(%rdi), %xmm9
1241 ; SSE-NEXT: movdqa 80(%rdi), %xmm11
1242 ; SSE-NEXT: movdqa (%rdi), %xmm15
1243 ; SSE-NEXT: movdqa 16(%rdi), %xmm10
1244 ; SSE-NEXT: movdqa 32(%rdi), %xmm7
1245 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1246 ; SSE-NEXT: movdqa 48(%rdi), %xmm0
1247 ; SSE-NEXT: movdqa 64(%rdi), %xmm12
1248 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0]
1249 ; SSE-NEXT: movdqa %xmm1, %xmm2
1250 ; SSE-NEXT: pandn %xmm12, %xmm2
1251 ; SSE-NEXT: movdqa %xmm0, %xmm3
1252 ; SSE-NEXT: pand %xmm1, %xmm3
1253 ; SSE-NEXT: por %xmm2, %xmm3
1254 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3]
1255 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
1256 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
1257 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7]
1258 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,7,6,7]
1259 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,2,1]
1260 ; SSE-NEXT: movdqa %xmm11, %xmm8
1261 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1262 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
1263 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0]
1264 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
1265 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1266 ; SSE-NEXT: movdqa %xmm1, %xmm2
1267 ; SSE-NEXT: pandn %xmm10, %xmm2
1268 ; SSE-NEXT: movdqa %xmm15, %xmm3
1269 ; SSE-NEXT: pand %xmm1, %xmm3
1270 ; SSE-NEXT: por %xmm2, %xmm3
1271 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3]
1272 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
1273 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
1274 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7]
1275 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,7,6,7]
1276 ; SSE-NEXT: movdqa %xmm10, %xmm11
1277 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1278 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,2,1]
1279 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
1280 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0]
1281 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
1282 ; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill
1283 ; SSE-NEXT: movdqa %xmm1, %xmm2
1284 ; SSE-NEXT: movdqa %xmm9, %xmm7
1285 ; SSE-NEXT: pandn %xmm9, %xmm2
1286 ; SSE-NEXT: movdqa %xmm13, %xmm3
1287 ; SSE-NEXT: pand %xmm1, %xmm3
1288 ; SSE-NEXT: por %xmm2, %xmm3
1289 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,7,6,7]
1290 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,2,1]
1291 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
1292 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[2,0]
1293 ; SSE-NEXT: movdqa 112(%rdi), %xmm6
1294 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3]
1295 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
1296 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
1297 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
1298 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0]
1299 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1300 ; SSE-NEXT: movdqa %xmm5, %xmm2
1301 ; SSE-NEXT: pand %xmm1, %xmm2
1302 ; SSE-NEXT: pandn %xmm6, %xmm1
1303 ; SSE-NEXT: por %xmm2, %xmm1
1304 ; SSE-NEXT: movdqa 128(%rdi), %xmm5
1305 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,2,1]
1306 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5]
1307 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,7,6,7]
1308 ; SSE-NEXT: movdqa %xmm6, %xmm9
1309 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1310 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[2,0]
1311 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
1312 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
1313 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
1314 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
1315 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
1316 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1317 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,0,65535,65535,0,65535,65535]
1318 ; SSE-NEXT: movdqa %xmm14, %xmm4
1319 ; SSE-NEXT: pandn %xmm0, %xmm4
1320 ; SSE-NEXT: movdqa %xmm0, %xmm1
1321 ; SSE-NEXT: movdqa %xmm14, %xmm0
1322 ; SSE-NEXT: pandn %xmm12, %xmm0
1323 ; SSE-NEXT: pand %xmm14, %xmm1
1324 ; SSE-NEXT: por %xmm0, %xmm1
1325 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,3,2,3,4,5,6,7]
1326 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
1327 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,5,6]
1328 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
1329 ; SSE-NEXT: movdqa %xmm0, %xmm10
1330 ; SSE-NEXT: pandn %xmm2, %xmm10
1331 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
1332 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
1333 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
1334 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
1335 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
1336 ; SSE-NEXT: pand %xmm0, %xmm1
1337 ; SSE-NEXT: por %xmm1, %xmm10
1338 ; SSE-NEXT: movdqa %xmm14, %xmm3
1339 ; SSE-NEXT: pandn %xmm15, %xmm3
1340 ; SSE-NEXT: movdqa %xmm15, %xmm1
1341 ; SSE-NEXT: movdqa %xmm14, %xmm2
1342 ; SSE-NEXT: pandn %xmm11, %xmm2
1343 ; SSE-NEXT: pand %xmm14, %xmm1
1344 ; SSE-NEXT: por %xmm2, %xmm1
1345 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
1346 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,3,2,3,4,5,6,7]
1347 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
1348 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6]
1349 ; SSE-NEXT: movdqa %xmm0, %xmm15
1350 ; SSE-NEXT: pandn %xmm2, %xmm15
1351 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
1352 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
1353 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
1354 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
1355 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
1356 ; SSE-NEXT: pand %xmm0, %xmm1
1357 ; SSE-NEXT: por %xmm1, %xmm15
1358 ; SSE-NEXT: movdqa %xmm14, %xmm6
1359 ; SSE-NEXT: pandn %xmm13, %xmm6
1360 ; SSE-NEXT: movdqa %xmm13, %xmm1
1361 ; SSE-NEXT: movdqa %xmm14, %xmm13
1362 ; SSE-NEXT: pandn %xmm7, %xmm13
1363 ; SSE-NEXT: pand %xmm14, %xmm1
1364 ; SSE-NEXT: por %xmm13, %xmm1
1365 ; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
1366 ; SSE-NEXT: # xmm13 = mem[0,3,2,3,4,5,6,7]
1367 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,0,3]
1368 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,4,5,6]
1369 ; SSE-NEXT: movdqa %xmm0, %xmm13
1370 ; SSE-NEXT: pandn %xmm2, %xmm13
1371 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
1372 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
1373 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
1374 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
1375 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
1376 ; SSE-NEXT: pand %xmm0, %xmm1
1377 ; SSE-NEXT: por %xmm1, %xmm13
1378 ; SSE-NEXT: movdqa %xmm14, %xmm1
1379 ; SSE-NEXT: pandn %xmm9, %xmm1
1380 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1381 ; SSE-NEXT: movdqa %xmm8, %xmm2
1382 ; SSE-NEXT: pand %xmm14, %xmm2
1383 ; SSE-NEXT: por %xmm1, %xmm2
1384 ; SSE-NEXT: movdqa %xmm5, %xmm9
1385 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,3,2,3,4,5,6,7]
1386 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
1387 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6]
1388 ; SSE-NEXT: movdqa %xmm0, %xmm5
1389 ; SSE-NEXT: pandn %xmm1, %xmm5
1390 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[2,1,2,3,4,5,6,7]
1391 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
1392 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
1393 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
1394 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
1395 ; SSE-NEXT: pand %xmm0, %xmm1
1396 ; SSE-NEXT: por %xmm1, %xmm5
1397 ; SSE-NEXT: pand %xmm14, %xmm12
1398 ; SSE-NEXT: por %xmm4, %xmm12
1399 ; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1400 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,7,6,7]
1401 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
1402 ; SSE-NEXT: movdqa %xmm0, %xmm4
1403 ; SSE-NEXT: pandn %xmm1, %xmm4
1404 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,0]
1405 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
1406 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
1407 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
1408 ; SSE-NEXT: pand %xmm0, %xmm1
1409 ; SSE-NEXT: por %xmm1, %xmm4
1410 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1411 ; SSE-NEXT: pand %xmm14, %xmm1
1412 ; SSE-NEXT: por %xmm3, %xmm1
1413 ; SSE-NEXT: movdqa %xmm1, %xmm2
1414 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,7,6,7]
1415 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
1416 ; SSE-NEXT: movdqa %xmm0, %xmm3
1417 ; SSE-NEXT: pandn %xmm1, %xmm3
1418 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,0]
1419 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
1420 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
1421 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
1422 ; SSE-NEXT: pand %xmm0, %xmm1
1423 ; SSE-NEXT: por %xmm1, %xmm3
1424 ; SSE-NEXT: pand %xmm14, %xmm7
1425 ; SSE-NEXT: por %xmm6, %xmm7
1426 ; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1427 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,4,7,6,7]
1428 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
1429 ; SSE-NEXT: movdqa %xmm0, %xmm2
1430 ; SSE-NEXT: pandn %xmm1, %xmm2
1431 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,1,2,0]
1432 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
1433 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
1434 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
1435 ; SSE-NEXT: pand %xmm0, %xmm1
1436 ; SSE-NEXT: por %xmm1, %xmm2
1437 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1438 ; SSE-NEXT: pand %xmm14, %xmm1
1439 ; SSE-NEXT: pandn %xmm8, %xmm14
1440 ; SSE-NEXT: por %xmm1, %xmm14
1441 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,1,2,0]
1442 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
1443 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
1444 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
1445 ; SSE-NEXT: pand %xmm0, %xmm1
1446 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,7,6,7]
1447 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2]
1448 ; SSE-NEXT: pandn %xmm6, %xmm0
1449 ; SSE-NEXT: por %xmm1, %xmm0
1450 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1451 ; SSE-NEXT: movaps %xmm1, 32(%rsi)
1452 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1453 ; SSE-NEXT: movaps %xmm1, 48(%rsi)
1454 ; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
1455 ; SSE-NEXT: movaps %xmm1, (%rsi)
1456 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1457 ; SSE-NEXT: movaps %xmm1, 16(%rsi)
1458 ; SSE-NEXT: movdqa %xmm5, 32(%rdx)
1459 ; SSE-NEXT: movdqa %xmm13, 48(%rdx)
1460 ; SSE-NEXT: movdqa %xmm15, (%rdx)
1461 ; SSE-NEXT: movdqa %xmm10, 16(%rdx)
1462 ; SSE-NEXT: movdqa %xmm0, 32(%rcx)
1463 ; SSE-NEXT: movdqa %xmm2, 48(%rcx)
1464 ; SSE-NEXT: movdqa %xmm3, (%rcx)
1465 ; SSE-NEXT: movdqa %xmm4, 16(%rcx)
1466 ; SSE-NEXT: addq $40, %rsp
1469 ; AVX-LABEL: load_i16_stride3_vf32:
1471 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm2
1472 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm5
1473 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7]
1474 ; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
1475 ; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm0
1476 ; AVX-NEXT: vmovdqa (%rdi), %xmm1
1477 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
1478 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm4
1479 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm6
1480 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,3,2,3,4,5,6,7]
1481 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,3]
1482 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3,4,5,6,7]
1483 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7]
1484 ; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
1485 ; AVX-NEXT: vpshufb %xmm12, %xmm7, %xmm7
1486 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,1,2,1]
1487 ; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5]
1488 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7]
1489 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
1490 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1491 ; AVX-NEXT: vmovdqa 176(%rdi), %xmm7
1492 ; AVX-NEXT: vmovdqa 160(%rdi), %xmm8
1493 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
1494 ; AVX-NEXT: vpshufb %xmm9, %xmm10, %xmm10
1495 ; AVX-NEXT: vmovdqa 144(%rdi), %xmm9
1496 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,3,2,3,4,5,6,7]
1497 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,3]
1498 ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm11[0,1,2],xmm10[3,4,5,6,7]
1499 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm10
1500 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm11
1501 ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6],xmm10[7]
1502 ; AVX-NEXT: vpshufb %xmm12, %xmm14, %xmm14
1503 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm12
1504 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[0,1,2,1]
1505 ; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5]
1506 ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7]
1507 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm0
1508 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1509 ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7]
1510 ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm6[1],xmm14[2,3],xmm6[4],xmm14[5,6],xmm6[7]
1511 ; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
1512 ; AVX-NEXT: vpshufb %xmm15, %xmm14, %xmm14
1513 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6,7]
1514 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7]
1515 ; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm0
1516 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
1517 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
1518 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6],xmm9[7]
1519 ; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm0
1520 ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm11[0,1],xmm10[2],xmm11[3,4],xmm10[5],xmm11[6,7]
1521 ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm12[0],xmm13[1,2],xmm12[3],xmm13[4,5],xmm12[6],xmm13[7]
1522 ; AVX-NEXT: vpshufb %xmm15, %xmm13, %xmm13
1523 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0
1524 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3],xmm2[4],xmm5[5,6],xmm2[7]
1525 ; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15]
1526 ; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm2
1527 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
1528 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7]
1529 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3,4,5,6,7]
1530 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
1531 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6],xmm4[7]
1532 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
1533 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1534 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1535 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7]
1536 ; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm2
1537 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[2,1,2,3]
1538 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
1539 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3,4,5,6,7]
1540 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6,7]
1541 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm12[1],xmm4[2,3],xmm12[4],xmm4[5,6],xmm12[7]
1542 ; AVX-NEXT: vpshufb %xmm3, %xmm4, %xmm3
1543 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
1544 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1545 ; AVX-NEXT: vmovaps %ymm3, 32(%rsi)
1546 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1547 ; AVX-NEXT: vmovaps %ymm3, (%rsi)
1548 ; AVX-NEXT: vmovaps %ymm0, 32(%rdx)
1549 ; AVX-NEXT: vmovaps %ymm14, (%rdx)
1550 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx)
1551 ; AVX-NEXT: vmovaps %ymm1, (%rcx)
1552 ; AVX-NEXT: vzeroupper
1555 ; AVX2-LABEL: load_i16_stride3_vf32:
1557 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1558 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
1559 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm2
1560 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm4
1561 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
1562 ; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm3
1563 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1]
1564 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7],ymm3[8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14],ymm5[15]
1565 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
1566 ; AVX2-NEXT: vpshufb %ymm8, %ymm3, %ymm3
1567 ; AVX2-NEXT: vmovdqa 176(%rdi), %xmm5
1568 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm6
1569 ; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
1570 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
1571 ; AVX2-NEXT: vpshufb %xmm10, %xmm9, %xmm9
1572 ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
1573 ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1,2],ymm9[3,4,5,6,7],ymm3[8,9,10],ymm9[11,12,13,14,15]
1574 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
1575 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
1576 ; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm7
1577 ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm7[2,3,0,1]
1578 ; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7],ymm7[8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13,14],ymm9[15]
1579 ; AVX2-NEXT: vpshufb %ymm8, %ymm7, %ymm9
1580 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm7
1581 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm8
1582 ; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
1583 ; AVX2-NEXT: vpshufb %xmm10, %xmm11, %xmm10
1584 ; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
1585 ; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15]
1586 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7]
1587 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
1588 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
1589 ; AVX2-NEXT: vpblendvb %ymm11, %ymm4, %ymm2, %ymm10
1590 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
1591 ; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
1592 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
1593 ; AVX2-NEXT: vpshufb %ymm12, %ymm10, %ymm10
1594 ; AVX2-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
1595 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
1596 ; AVX2-NEXT: vpshufb %xmm14, %xmm13, %xmm13
1597 ; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
1598 ; AVX2-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0,1,2],ymm13[3,4,5,6,7],ymm10[8,9,10],ymm13[11,12,13,14,15]
1599 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4]
1600 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
1601 ; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11
1602 ; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1]
1603 ; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7,8,9],ymm13[10],ymm11[11,12],ymm13[13],ymm11[14,15]
1604 ; AVX2-NEXT: vpshufb %ymm12, %ymm11, %ymm11
1605 ; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
1606 ; AVX2-NEXT: vpshufb %xmm14, %xmm12, %xmm12
1607 ; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
1608 ; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
1609 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,7,4]
1610 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
1611 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
1612 ; AVX2-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2
1613 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
1614 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15]
1615 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
1616 ; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2
1617 ; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
1618 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
1619 ; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm5
1620 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
1621 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
1622 ; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0
1623 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
1624 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
1625 ; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
1626 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7]
1627 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1
1628 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
1629 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
1630 ; AVX2-NEXT: vmovdqa %ymm3, 32(%rsi)
1631 ; AVX2-NEXT: vmovdqa %ymm9, (%rsi)
1632 ; AVX2-NEXT: vmovdqa %ymm10, 32(%rdx)
1633 ; AVX2-NEXT: vmovdqa %ymm11, (%rdx)
1634 ; AVX2-NEXT: vmovdqa %ymm2, 32(%rcx)
1635 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
1636 ; AVX2-NEXT: vzeroupper
1639 ; AVX2-FP-LABEL: load_i16_stride3_vf32:
1641 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
1642 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1
1643 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm2
1644 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm4
1645 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
1646 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm3
1647 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1]
1648 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7],ymm3[8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14],ymm5[15]
1649 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
1650 ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm3, %ymm3
1651 ; AVX2-FP-NEXT: vmovdqa 176(%rdi), %xmm5
1652 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm6
1653 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
1654 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
1655 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm9, %xmm9
1656 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
1657 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1,2],ymm9[3,4,5,6,7],ymm3[8,9,10],ymm9[11,12,13,14,15]
1658 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
1659 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
1660 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm7
1661 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm7[2,3,0,1]
1662 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7],ymm7[8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13,14],ymm9[15]
1663 ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm7, %ymm9
1664 ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm7
1665 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm8
1666 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
1667 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm11, %xmm10
1668 ; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
1669 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15]
1670 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7]
1671 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
1672 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
1673 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm4, %ymm2, %ymm10
1674 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
1675 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
1676 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
1677 ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm10, %ymm10
1678 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
1679 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
1680 ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm13, %xmm13
1681 ; AVX2-FP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
1682 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0,1,2],ymm13[3,4,5,6,7],ymm10[8,9,10],ymm13[11,12,13,14,15]
1683 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4]
1684 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
1685 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11
1686 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1]
1687 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7,8,9],ymm13[10],ymm11[11,12],ymm13[13],ymm11[14,15]
1688 ; AVX2-FP-NEXT: vpshufb %ymm12, %ymm11, %ymm11
1689 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
1690 ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm12, %xmm12
1691 ; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
1692 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
1693 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,7,4]
1694 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
1695 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
1696 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2
1697 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
1698 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15]
1699 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
1700 ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
1701 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
1702 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
1703 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm5
1704 ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
1705 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
1706 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0
1707 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
1708 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
1709 ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
1710 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7]
1711 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
1712 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
1713 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
1714 ; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%rsi)
1715 ; AVX2-FP-NEXT: vmovdqa %ymm9, (%rsi)
1716 ; AVX2-FP-NEXT: vmovdqa %ymm10, 32(%rdx)
1717 ; AVX2-FP-NEXT: vmovdqa %ymm11, (%rdx)
1718 ; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%rcx)
1719 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rcx)
1720 ; AVX2-FP-NEXT: vzeroupper
1721 ; AVX2-FP-NEXT: retq
1723 ; AVX2-FCP-LABEL: load_i16_stride3_vf32:
1724 ; AVX2-FCP: # %bb.0:
1725 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
1726 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
1727 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm2
1728 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm4
1729 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
1730 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm3
1731 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1]
1732 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7],ymm3[8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14],ymm5[15]
1733 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
1734 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3
1735 ; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm5
1736 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm6
1737 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
1738 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
1739 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9
1740 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
1741 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1,2],ymm9[3,4,5,6,7],ymm3[8,9,10],ymm9[11,12,13,14,15]
1742 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
1743 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
1744 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm7
1745 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm7[2,3,0,1]
1746 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7],ymm7[8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13,14],ymm9[15]
1747 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9
1748 ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm7
1749 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm8
1750 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
1751 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm10
1752 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
1753 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15]
1754 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7]
1755 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
1756 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
1757 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm4, %ymm2, %ymm10
1758 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
1759 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
1760 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
1761 ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm10, %ymm10
1762 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
1763 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
1764 ; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm13, %xmm13
1765 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
1766 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0,1,2],ymm13[3,4,5,6,7],ymm10[8,9,10],ymm13[11,12,13,14,15]
1767 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4]
1768 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
1769 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11
1770 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1]
1771 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7,8,9],ymm13[10],ymm11[11,12],ymm13[13],ymm11[14,15]
1772 ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm11
1773 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
1774 ; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12
1775 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
1776 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
1777 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,7,4]
1778 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
1779 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
1780 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2
1781 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
1782 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15]
1783 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
1784 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
1785 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
1786 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
1787 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5
1788 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
1789 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
1790 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0
1791 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
1792 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
1793 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
1794 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7]
1795 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
1796 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
1797 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
1798 ; AVX2-FCP-NEXT: vmovdqa %ymm3, 32(%rsi)
1799 ; AVX2-FCP-NEXT: vmovdqa %ymm9, (%rsi)
1800 ; AVX2-FCP-NEXT: vmovdqa %ymm10, 32(%rdx)
1801 ; AVX2-FCP-NEXT: vmovdqa %ymm11, (%rdx)
1802 ; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rcx)
1803 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rcx)
1804 ; AVX2-FCP-NEXT: vzeroupper
1805 ; AVX2-FCP-NEXT: retq
1807 ; AVX512-LABEL: load_i16_stride3_vf32:
1809 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
1810 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm5
1811 ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm6
1812 ; AVX512-NEXT: vmovdqa %ymm0, %ymm1
1813 ; AVX512-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm1
1814 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
1815 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
1816 ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
1817 ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm1
1818 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm2
1819 ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
1820 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
1821 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7]
1822 ; AVX512-NEXT: vmovdqa (%rdi), %ymm8
1823 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm9
1824 ; AVX512-NEXT: vmovdqa %ymm0, %ymm3
1825 ; AVX512-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3
1826 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
1827 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
1828 ; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
1829 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm3
1830 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm4
1831 ; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
1832 ; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
1833 ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
1834 ; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15]
1835 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7]
1836 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
1837 ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7
1838 ; AVX512-NEXT: vmovdqa %ymm0, %ymm10
1839 ; AVX512-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10
1840 ; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
1841 ; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15]
1842 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
1843 ; AVX512-NEXT: vpshufb %ymm11, %ymm10, %ymm10
1844 ; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
1845 ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
1846 ; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
1847 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
1848 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
1849 ; AVX512-NEXT: vmovdqa %ymm12, %ymm13
1850 ; AVX512-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm13
1851 ; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
1852 ; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15]
1853 ; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
1854 ; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
1855 ; AVX512-NEXT: vpshufb %xmm11, %xmm14, %xmm11
1856 ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
1857 ; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15]
1858 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,7,4]
1859 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
1860 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
1861 ; AVX512-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm12
1862 ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1]
1863 ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15]
1864 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
1865 ; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm5
1866 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
1867 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
1868 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
1869 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
1870 ; AVX512-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0
1871 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
1872 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
1873 ; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0
1874 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
1875 ; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm2
1876 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
1877 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
1878 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1879 ; AVX512-NEXT: vmovdqa64 %zmm7, (%rsi)
1880 ; AVX512-NEXT: vmovdqa64 %zmm10, (%rdx)
1881 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rcx)
1882 ; AVX512-NEXT: vzeroupper
1885 ; AVX512-FCP-LABEL: load_i16_stride3_vf32:
1886 ; AVX512-FCP: # %bb.0:
1887 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
1888 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
1889 ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm6
1890 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1
1891 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm1
1892 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
1893 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
1894 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
1895 ; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm1
1896 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm2
1897 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
1898 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
1899 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7]
1900 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm8
1901 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm9
1902 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3
1903 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3
1904 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
1905 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
1906 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
1907 ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm3
1908 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
1909 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
1910 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
1911 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
1912 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15]
1913 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7]
1914 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
1915 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7
1916 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm10
1917 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10
1918 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
1919 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15]
1920 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
1921 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10
1922 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
1923 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
1924 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
1925 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
1926 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
1927 ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm13
1928 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm13
1929 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
1930 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15]
1931 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
1932 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
1933 ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm14, %xmm11
1934 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
1935 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15]
1936 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,7,4]
1937 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
1938 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
1939 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm12
1940 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1]
1941 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15]
1942 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
1943 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
1944 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
1945 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
1946 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
1947 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
1948 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0
1949 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
1950 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
1951 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
1952 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
1953 ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
1954 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
1955 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
1956 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1957 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rsi)
1958 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rdx)
1959 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rcx)
1960 ; AVX512-FCP-NEXT: vzeroupper
1961 ; AVX512-FCP-NEXT: retq
1963 ; AVX512DQ-LABEL: load_i16_stride3_vf32:
1964 ; AVX512DQ: # %bb.0:
1965 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
1966 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm5
1967 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm6
1968 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
1969 ; AVX512DQ-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm1
1970 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
1971 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
1972 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
1973 ; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm1
1974 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm2
1975 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
1976 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
1977 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7]
1978 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm8
1979 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm9
1980 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm3
1981 ; AVX512DQ-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3
1982 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
1983 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
1984 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
1985 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm3
1986 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm4
1987 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
1988 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
1989 ; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
1990 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15]
1991 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7]
1992 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
1993 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7
1994 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm10
1995 ; AVX512DQ-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10
1996 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
1997 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15]
1998 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
1999 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm10, %ymm10
2000 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
2001 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
2002 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
2003 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
2004 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
2005 ; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm13
2006 ; AVX512DQ-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm13
2007 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
2008 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15]
2009 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
2010 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
2011 ; AVX512DQ-NEXT: vpshufb %xmm11, %xmm14, %xmm11
2012 ; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2013 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15]
2014 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,7,4]
2015 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
2016 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
2017 ; AVX512DQ-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm12
2018 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1]
2019 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15]
2020 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
2021 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm5
2022 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
2023 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
2024 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
2025 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
2026 ; AVX512DQ-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0
2027 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
2028 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
2029 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0
2030 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
2031 ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm2, %xmm2
2032 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
2033 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
2034 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2035 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rsi)
2036 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rdx)
2037 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rcx)
2038 ; AVX512DQ-NEXT: vzeroupper
2039 ; AVX512DQ-NEXT: retq
2041 ; AVX512DQ-FCP-LABEL: load_i16_stride3_vf32:
2042 ; AVX512DQ-FCP: # %bb.0:
2043 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
2044 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
2045 ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm6
2046 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1
2047 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm1
2048 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
2049 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
2050 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
2051 ; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm1
2052 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm2
2053 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
2054 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
2055 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7]
2056 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm8
2057 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm9
2058 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3
2059 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm9, %ymm8, %ymm3
2060 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
2061 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
2062 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
2063 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm3
2064 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
2065 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
2066 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
2067 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2068 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15]
2069 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7]
2070 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
2071 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7
2072 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm10
2073 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm6, %ymm5, %ymm10
2074 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
2075 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15]
2076 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
2077 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10
2078 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
2079 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
2080 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
2081 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
2082 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
2083 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm13
2084 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm13
2085 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
2086 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15]
2087 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
2088 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
2089 ; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm14, %xmm11
2090 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2091 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15]
2092 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,7,4]
2093 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
2094 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
2095 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm6, %ymm12
2096 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1]
2097 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15]
2098 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
2099 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
2100 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
2101 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
2102 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
2103 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
2104 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm8, %ymm9, %ymm0
2105 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
2106 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
2107 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
2108 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
2109 ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
2110 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
2111 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
2112 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2113 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rsi)
2114 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rdx)
2115 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rcx)
2116 ; AVX512DQ-FCP-NEXT: vzeroupper
2117 ; AVX512DQ-FCP-NEXT: retq
2119 ; AVX512BW-LABEL: load_i16_stride3_vf32:
2120 ; AVX512BW: # %bb.0:
2121 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
2122 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
2123 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
2124 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0]
2125 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
2126 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61]
2127 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm4
2128 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0]
2129 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
2130 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62]
2131 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5
2132 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0]
2133 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm3
2134 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63]
2135 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0
2136 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rsi)
2137 ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx)
2138 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
2139 ; AVX512BW-NEXT: vzeroupper
2140 ; AVX512BW-NEXT: retq
2142 ; AVX512BW-FCP-LABEL: load_i16_stride3_vf32:
2143 ; AVX512BW-FCP: # %bb.0:
2144 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
2145 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
2146 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
2147 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0]
2148 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
2149 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61]
2150 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm4
2151 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0]
2152 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
2153 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62]
2154 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm5
2155 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0]
2156 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm3
2157 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63]
2158 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm0
2159 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi)
2160 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
2161 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rcx)
2162 ; AVX512BW-FCP-NEXT: vzeroupper
2163 ; AVX512BW-FCP-NEXT: retq
2165 ; AVX512DQ-BW-LABEL: load_i16_stride3_vf32:
2166 ; AVX512DQ-BW: # %bb.0:
2167 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
2168 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
2169 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2
2170 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0]
2171 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
2172 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61]
2173 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm4
2174 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0]
2175 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
2176 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62]
2177 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5
2178 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0]
2179 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm3
2180 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63]
2181 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0
2182 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rsi)
2183 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rdx)
2184 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rcx)
2185 ; AVX512DQ-BW-NEXT: vzeroupper
2186 ; AVX512DQ-BW-NEXT: retq
2188 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride3_vf32:
2189 ; AVX512DQ-BW-FCP: # %bb.0:
2190 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
2191 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
2192 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
2193 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0]
2194 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
2195 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61]
2196 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm4
2197 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0]
2198 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
2199 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62]
2200 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm5
2201 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0]
2202 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm3
2203 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63]
2204 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm0
2205 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi)
2206 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
2207 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rcx)
2208 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
2209 ; AVX512DQ-BW-FCP-NEXT: retq
2210 %wide.vec = load <96 x i16>, ptr %in.vec, align 64
2211 %strided.vec0 = shufflevector <96 x i16> %wide.vec, <96 x i16> poison, <32 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93>
2212 %strided.vec1 = shufflevector <96 x i16> %wide.vec, <96 x i16> poison, <32 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94>
2213 %strided.vec2 = shufflevector <96 x i16> %wide.vec, <96 x i16> poison, <32 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62, i32 65, i32 68, i32 71, i32 74, i32 77, i32 80, i32 83, i32 86, i32 89, i32 92, i32 95>
2214 store <32 x i16> %strided.vec0, ptr %out.vec0, align 64
2215 store <32 x i16> %strided.vec1, ptr %out.vec1, align 64
2216 store <32 x i16> %strided.vec2, ptr %out.vec2, align 64
2220 define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
2221 ; SSE-LABEL: load_i16_stride3_vf64:
2223 ; SSE-NEXT: subq $440, %rsp # imm = 0x1B8
2224 ; SSE-NEXT: movdqa 192(%rdi), %xmm14
2225 ; SSE-NEXT: movdqa 272(%rdi), %xmm6
2226 ; SSE-NEXT: movdqa 240(%rdi), %xmm5
2227 ; SSE-NEXT: movdqa 256(%rdi), %xmm7
2228 ; SSE-NEXT: movdqa 80(%rdi), %xmm10
2229 ; SSE-NEXT: movdqa (%rdi), %xmm15
2230 ; SSE-NEXT: movdqa 16(%rdi), %xmm9
2231 ; SSE-NEXT: movdqa 32(%rdi), %xmm8
2232 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2233 ; SSE-NEXT: movdqa 48(%rdi), %xmm0
2234 ; SSE-NEXT: movdqa 64(%rdi), %xmm11
2235 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0]
2236 ; SSE-NEXT: movdqa %xmm1, %xmm2
2237 ; SSE-NEXT: pandn %xmm11, %xmm2
2238 ; SSE-NEXT: movdqa %xmm0, %xmm3
2239 ; SSE-NEXT: pand %xmm1, %xmm3
2240 ; SSE-NEXT: por %xmm2, %xmm3
2241 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3]
2242 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
2243 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
2244 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7]
2245 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,7,6,7]
2246 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2247 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,1,2,1]
2248 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2249 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
2250 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0]
2251 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
2252 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2253 ; SSE-NEXT: movdqa %xmm1, %xmm2
2254 ; SSE-NEXT: pandn %xmm7, %xmm2
2255 ; SSE-NEXT: movdqa %xmm5, %xmm3
2256 ; SSE-NEXT: pand %xmm1, %xmm3
2257 ; SSE-NEXT: por %xmm2, %xmm3
2258 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3]
2259 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
2260 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
2261 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,3,2,1,4,5,6,7]
2262 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,7,6,7]
2263 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2264 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,1,2,1]
2265 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2266 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
2267 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,0],xmm2[2,0]
2268 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0]
2269 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2270 ; SSE-NEXT: movdqa %xmm1, %xmm2
2271 ; SSE-NEXT: pandn %xmm9, %xmm2
2272 ; SSE-NEXT: movdqa %xmm15, %xmm3
2273 ; SSE-NEXT: pand %xmm1, %xmm3
2274 ; SSE-NEXT: por %xmm2, %xmm3
2275 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,7,6,7]
2276 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2277 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,1]
2278 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
2279 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm2[2,0]
2280 ; SSE-NEXT: movdqa 208(%rdi), %xmm8
2281 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3]
2282 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
2283 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
2284 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
2285 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0]
2286 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2287 ; SSE-NEXT: movdqa %xmm1, %xmm2
2288 ; SSE-NEXT: pandn %xmm8, %xmm2
2289 ; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill
2290 ; SSE-NEXT: movdqa %xmm14, %xmm3
2291 ; SSE-NEXT: pand %xmm1, %xmm3
2292 ; SSE-NEXT: por %xmm2, %xmm3
2293 ; SSE-NEXT: movdqa 224(%rdi), %xmm2
2294 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2295 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
2296 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5]
2297 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,7,6,7]
2298 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[2,0]
2299 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3]
2300 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
2301 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3]
2302 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7]
2303 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
2304 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2305 ; SSE-NEXT: movdqa 160(%rdi), %xmm3
2306 ; SSE-NEXT: movdqa %xmm1, %xmm2
2307 ; SSE-NEXT: pandn %xmm3, %xmm2
2308 ; SSE-NEXT: movdqa %xmm3, %xmm4
2309 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2310 ; SSE-NEXT: movdqa 144(%rdi), %xmm3
2311 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2312 ; SSE-NEXT: pand %xmm1, %xmm3
2313 ; SSE-NEXT: por %xmm2, %xmm3
2314 ; SSE-NEXT: movdqa 176(%rdi), %xmm2
2315 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2316 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
2317 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5]
2318 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
2319 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[2,0]
2320 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3]
2321 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
2322 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3]
2323 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7]
2324 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
2325 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2326 ; SSE-NEXT: movdqa 352(%rdi), %xmm3
2327 ; SSE-NEXT: movdqa %xmm1, %xmm2
2328 ; SSE-NEXT: pandn %xmm3, %xmm2
2329 ; SSE-NEXT: movdqa %xmm3, %xmm4
2330 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2331 ; SSE-NEXT: movdqa 336(%rdi), %xmm3
2332 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2333 ; SSE-NEXT: pand %xmm1, %xmm3
2334 ; SSE-NEXT: por %xmm2, %xmm3
2335 ; SSE-NEXT: movdqa 368(%rdi), %xmm2
2336 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2337 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
2338 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5]
2339 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
2340 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[2,0]
2341 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3]
2342 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
2343 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3]
2344 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7]
2345 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
2346 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2347 ; SSE-NEXT: movdqa 112(%rdi), %xmm3
2348 ; SSE-NEXT: movdqa %xmm1, %xmm2
2349 ; SSE-NEXT: pandn %xmm3, %xmm2
2350 ; SSE-NEXT: movdqa %xmm3, %xmm4
2351 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2352 ; SSE-NEXT: movdqa 96(%rdi), %xmm3
2353 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2354 ; SSE-NEXT: pand %xmm1, %xmm3
2355 ; SSE-NEXT: por %xmm2, %xmm3
2356 ; SSE-NEXT: movdqa 128(%rdi), %xmm2
2357 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2358 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
2359 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5]
2360 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
2361 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm4[2,0]
2362 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3]
2363 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
2364 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3]
2365 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7]
2366 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0]
2367 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2368 ; SSE-NEXT: movdqa 288(%rdi), %xmm2
2369 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2370 ; SSE-NEXT: pand %xmm1, %xmm2
2371 ; SSE-NEXT: movdqa 304(%rdi), %xmm3
2372 ; SSE-NEXT: pandn %xmm3, %xmm1
2373 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2374 ; SSE-NEXT: por %xmm2, %xmm1
2375 ; SSE-NEXT: movdqa 320(%rdi), %xmm2
2376 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2377 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
2378 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5]
2379 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
2380 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[2,0]
2381 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
2382 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
2383 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
2384 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
2385 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
2386 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2387 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,0,65535,65535,0,65535,65535]
2388 ; SSE-NEXT: movdqa %xmm12, %xmm1
2389 ; SSE-NEXT: pandn %xmm0, %xmm1
2390 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2391 ; SSE-NEXT: movdqa %xmm0, %xmm2
2392 ; SSE-NEXT: movdqa %xmm12, %xmm0
2393 ; SSE-NEXT: pandn %xmm11, %xmm0
2394 ; SSE-NEXT: pand %xmm12, %xmm2
2395 ; SSE-NEXT: por %xmm0, %xmm2
2396 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,3,2,3,4,5,6,7]
2397 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
2398 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,5,6]
2399 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,0,0,0]
2400 ; SSE-NEXT: movdqa %xmm13, %xmm0
2401 ; SSE-NEXT: pandn %xmm3, %xmm0
2402 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
2403 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
2404 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
2405 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
2406 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
2407 ; SSE-NEXT: pand %xmm13, %xmm2
2408 ; SSE-NEXT: por %xmm2, %xmm0
2409 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2410 ; SSE-NEXT: movdqa %xmm12, %xmm0
2411 ; SSE-NEXT: pandn %xmm5, %xmm0
2412 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2413 ; SSE-NEXT: movdqa %xmm12, %xmm3
2414 ; SSE-NEXT: pandn %xmm7, %xmm3
2415 ; SSE-NEXT: pand %xmm12, %xmm5
2416 ; SSE-NEXT: por %xmm3, %xmm5
2417 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,3,2,3,4,5,6,7]
2418 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
2419 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6]
2420 ; SSE-NEXT: movdqa %xmm13, %xmm0
2421 ; SSE-NEXT: pandn %xmm3, %xmm0
2422 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,1,2,3,4,5,6,7]
2423 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
2424 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
2425 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
2426 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
2427 ; SSE-NEXT: pand %xmm13, %xmm2
2428 ; SSE-NEXT: por %xmm2, %xmm0
2429 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2430 ; SSE-NEXT: movdqa %xmm12, %xmm8
2431 ; SSE-NEXT: pandn %xmm15, %xmm8
2432 ; SSE-NEXT: movdqa %xmm12, %xmm3
2433 ; SSE-NEXT: pandn %xmm9, %xmm3
2434 ; SSE-NEXT: pand %xmm12, %xmm15
2435 ; SSE-NEXT: por %xmm3, %xmm15
2436 ; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
2437 ; SSE-NEXT: # xmm3 = mem[0,3,2,3,4,5,6,7]
2438 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
2439 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6]
2440 ; SSE-NEXT: movdqa %xmm13, %xmm0
2441 ; SSE-NEXT: pandn %xmm3, %xmm0
2442 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[2,1,2,3,4,5,6,7]
2443 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
2444 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
2445 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
2446 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
2447 ; SSE-NEXT: pand %xmm13, %xmm2
2448 ; SSE-NEXT: por %xmm2, %xmm0
2449 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2450 ; SSE-NEXT: movdqa %xmm12, %xmm0
2451 ; SSE-NEXT: pandn %xmm14, %xmm0
2452 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2453 ; SSE-NEXT: movdqa %xmm12, %xmm3
2454 ; SSE-NEXT: pandn (%rsp), %xmm3 # 16-byte Folded Reload
2455 ; SSE-NEXT: pand %xmm12, %xmm14
2456 ; SSE-NEXT: por %xmm3, %xmm14
2457 ; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
2458 ; SSE-NEXT: # xmm3 = mem[0,3,2,3,4,5,6,7]
2459 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
2460 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6]
2461 ; SSE-NEXT: movdqa %xmm13, %xmm0
2462 ; SSE-NEXT: pandn %xmm3, %xmm0
2463 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[2,1,2,3,4,5,6,7]
2464 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
2465 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
2466 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
2467 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
2468 ; SSE-NEXT: pand %xmm13, %xmm2
2469 ; SSE-NEXT: por %xmm2, %xmm0
2470 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2471 ; SSE-NEXT: movdqa %xmm12, %xmm5
2472 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2473 ; SSE-NEXT: pandn %xmm2, %xmm5
2474 ; SSE-NEXT: movdqa %xmm12, %xmm3
2475 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2476 ; SSE-NEXT: pandn %xmm14, %xmm3
2477 ; SSE-NEXT: pand %xmm12, %xmm2
2478 ; SSE-NEXT: por %xmm3, %xmm2
2479 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
2480 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[0,3,2,3,4,5,6,7]
2481 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
2482 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6]
2483 ; SSE-NEXT: movdqa %xmm13, %xmm15
2484 ; SSE-NEXT: pandn %xmm3, %xmm15
2485 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
2486 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
2487 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
2488 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
2489 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
2490 ; SSE-NEXT: pand %xmm13, %xmm2
2491 ; SSE-NEXT: por %xmm2, %xmm15
2492 ; SSE-NEXT: movdqa %xmm12, %xmm0
2493 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2494 ; SSE-NEXT: pandn %xmm2, %xmm0
2495 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2496 ; SSE-NEXT: movdqa %xmm12, %xmm3
2497 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
2498 ; SSE-NEXT: pand %xmm12, %xmm2
2499 ; SSE-NEXT: por %xmm3, %xmm2
2500 ; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
2501 ; SSE-NEXT: # xmm3 = mem[0,3,2,3,4,5,6,7]
2502 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
2503 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,5,6]
2504 ; SSE-NEXT: movdqa %xmm13, %xmm10
2505 ; SSE-NEXT: pandn %xmm3, %xmm10
2506 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
2507 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
2508 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
2509 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
2510 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
2511 ; SSE-NEXT: pand %xmm13, %xmm2
2512 ; SSE-NEXT: por %xmm2, %xmm10
2513 ; SSE-NEXT: movdqa %xmm12, %xmm2
2514 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2515 ; SSE-NEXT: pandn %xmm3, %xmm2
2516 ; SSE-NEXT: movdqa %xmm12, %xmm4
2517 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
2518 ; SSE-NEXT: pandn %xmm9, %xmm4
2519 ; SSE-NEXT: pand %xmm12, %xmm3
2520 ; SSE-NEXT: por %xmm4, %xmm3
2521 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2522 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[0,3,2,3,4,5,6,7]
2523 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
2524 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,4,5,6]
2525 ; SSE-NEXT: movdqa %xmm13, %xmm6
2526 ; SSE-NEXT: pandn %xmm1, %xmm6
2527 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[2,1,2,3,4,5,6,7]
2528 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
2529 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
2530 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
2531 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
2532 ; SSE-NEXT: pand %xmm13, %xmm1
2533 ; SSE-NEXT: por %xmm1, %xmm6
2534 ; SSE-NEXT: movdqa %xmm12, %xmm1
2535 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2536 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2537 ; SSE-NEXT: pand %xmm12, %xmm0
2538 ; SSE-NEXT: por %xmm1, %xmm0
2539 ; SSE-NEXT: pshuflw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2540 ; SSE-NEXT: # xmm1 = mem[0,3,2,3,4,5,6,7]
2541 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
2542 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6]
2543 ; SSE-NEXT: movdqa %xmm13, %xmm3
2544 ; SSE-NEXT: pandn %xmm1, %xmm3
2545 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
2546 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
2547 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
2548 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
2549 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
2550 ; SSE-NEXT: pand %xmm13, %xmm0
2551 ; SSE-NEXT: por %xmm0, %xmm3
2552 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2553 ; SSE-NEXT: pand %xmm12, %xmm0
2554 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2555 ; SSE-NEXT: movdqa %xmm0, %xmm1
2556 ; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2557 ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7]
2558 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
2559 ; SSE-NEXT: movdqa %xmm13, %xmm4
2560 ; SSE-NEXT: pandn %xmm0, %xmm4
2561 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0]
2562 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2563 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
2564 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
2565 ; SSE-NEXT: pand %xmm13, %xmm0
2566 ; SSE-NEXT: por %xmm0, %xmm4
2567 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2568 ; SSE-NEXT: pand %xmm12, %xmm0
2569 ; SSE-NEXT: por %xmm8, %xmm0
2570 ; SSE-NEXT: movdqa %xmm0, %xmm1
2571 ; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2572 ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7]
2573 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
2574 ; SSE-NEXT: movdqa %xmm13, %xmm8
2575 ; SSE-NEXT: pandn %xmm0, %xmm8
2576 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0]
2577 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2578 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
2579 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
2580 ; SSE-NEXT: pand %xmm13, %xmm0
2581 ; SSE-NEXT: por %xmm0, %xmm8
2582 ; SSE-NEXT: pand %xmm12, %xmm14
2583 ; SSE-NEXT: por %xmm5, %xmm14
2584 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,7,6,7]
2585 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
2586 ; SSE-NEXT: movdqa %xmm13, %xmm5
2587 ; SSE-NEXT: pandn %xmm0, %xmm5
2588 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,1,2,0]
2589 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2590 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
2591 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
2592 ; SSE-NEXT: pand %xmm13, %xmm0
2593 ; SSE-NEXT: por %xmm0, %xmm5
2594 ; SSE-NEXT: pand %xmm12, %xmm9
2595 ; SSE-NEXT: por %xmm2, %xmm9
2596 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,7,6,7]
2597 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
2598 ; SSE-NEXT: movdqa %xmm13, %xmm2
2599 ; SSE-NEXT: pandn %xmm0, %xmm2
2600 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,1,2,0]
2601 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2602 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
2603 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
2604 ; SSE-NEXT: pand %xmm13, %xmm0
2605 ; SSE-NEXT: por %xmm0, %xmm2
2606 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2607 ; SSE-NEXT: pand %xmm12, %xmm1
2608 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2609 ; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2610 ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7]
2611 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
2612 ; SSE-NEXT: movdqa %xmm13, %xmm14
2613 ; SSE-NEXT: pandn %xmm0, %xmm14
2614 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0]
2615 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2616 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
2617 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
2618 ; SSE-NEXT: pand %xmm13, %xmm0
2619 ; SSE-NEXT: por %xmm0, %xmm14
2620 ; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
2621 ; SSE-NEXT: pand %xmm12, %xmm0
2622 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2623 ; SSE-NEXT: movdqa %xmm0, %xmm1
2624 ; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2625 ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7]
2626 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
2627 ; SSE-NEXT: movdqa %xmm13, %xmm11
2628 ; SSE-NEXT: pandn %xmm0, %xmm11
2629 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0]
2630 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2631 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
2632 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
2633 ; SSE-NEXT: pand %xmm13, %xmm0
2634 ; SSE-NEXT: por %xmm0, %xmm11
2635 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2636 ; SSE-NEXT: pand %xmm12, %xmm0
2637 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2638 ; SSE-NEXT: movdqa %xmm0, %xmm1
2639 ; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2640 ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,7,6,7]
2641 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
2642 ; SSE-NEXT: movdqa %xmm13, %xmm9
2643 ; SSE-NEXT: pandn %xmm0, %xmm9
2644 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,0]
2645 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2646 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
2647 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
2648 ; SSE-NEXT: pand %xmm13, %xmm0
2649 ; SSE-NEXT: por %xmm0, %xmm9
2650 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2651 ; SSE-NEXT: pand %xmm12, %xmm0
2652 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
2653 ; SSE-NEXT: por %xmm0, %xmm12
2654 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,1,2,0]
2655 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2656 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
2657 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
2658 ; SSE-NEXT: pand %xmm13, %xmm0
2659 ; SSE-NEXT: pshufhw $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
2660 ; SSE-NEXT: # xmm12 = mem[0,1,2,3,4,7,6,7]
2661 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,0,2]
2662 ; SSE-NEXT: pandn %xmm12, %xmm13
2663 ; SSE-NEXT: por %xmm0, %xmm13
2664 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2665 ; SSE-NEXT: movaps %xmm0, 96(%rsi)
2666 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2667 ; SSE-NEXT: movaps %xmm0, 32(%rsi)
2668 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2669 ; SSE-NEXT: movaps %xmm0, 112(%rsi)
2670 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2671 ; SSE-NEXT: movaps %xmm0, 48(%rsi)
2672 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2673 ; SSE-NEXT: movaps %xmm0, 64(%rsi)
2674 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2675 ; SSE-NEXT: movaps %xmm0, (%rsi)
2676 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2677 ; SSE-NEXT: movaps %xmm0, 80(%rsi)
2678 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2679 ; SSE-NEXT: movaps %xmm0, 16(%rsi)
2680 ; SSE-NEXT: movdqa %xmm3, 96(%rdx)
2681 ; SSE-NEXT: movdqa %xmm6, 32(%rdx)
2682 ; SSE-NEXT: movdqa %xmm10, 112(%rdx)
2683 ; SSE-NEXT: movdqa %xmm15, 48(%rdx)
2684 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2685 ; SSE-NEXT: movaps %xmm0, 64(%rdx)
2686 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2687 ; SSE-NEXT: movaps %xmm0, (%rdx)
2688 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2689 ; SSE-NEXT: movaps %xmm0, 80(%rdx)
2690 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2691 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
2692 ; SSE-NEXT: movdqa %xmm13, 96(%rcx)
2693 ; SSE-NEXT: movdqa %xmm9, 112(%rcx)
2694 ; SSE-NEXT: movdqa %xmm11, 64(%rcx)
2695 ; SSE-NEXT: movdqa %xmm14, 80(%rcx)
2696 ; SSE-NEXT: movdqa %xmm2, 32(%rcx)
2697 ; SSE-NEXT: movdqa %xmm5, 48(%rcx)
2698 ; SSE-NEXT: movdqa %xmm8, (%rcx)
2699 ; SSE-NEXT: movdqa %xmm4, 16(%rcx)
2700 ; SSE-NEXT: addq $440, %rsp # imm = 0x1B8
2703 ; AVX-LABEL: load_i16_stride3_vf64:
2705 ; AVX-NEXT: subq $408, %rsp # imm = 0x198
2706 ; AVX-NEXT: vmovdqa 176(%rdi), %xmm6
2707 ; AVX-NEXT: vmovdqa 160(%rdi), %xmm5
2708 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
2709 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
2710 ; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
2711 ; AVX-NEXT: vmovdqa 144(%rdi), %xmm2
2712 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2713 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
2714 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
2715 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7]
2716 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm10
2717 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm7
2718 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6],xmm10[7]
2719 ; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
2720 ; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm3
2721 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm0
2722 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2723 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
2724 ; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
2725 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
2726 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0
2727 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2728 ; AVX-NEXT: vmovdqa 368(%rdi), %xmm0
2729 ; AVX-NEXT: vmovdqa 352(%rdi), %xmm2
2730 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2731 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
2732 ; AVX-NEXT: vmovdqa %xmm0, %xmm14
2733 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2734 ; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm2
2735 ; AVX-NEXT: vmovdqa 336(%rdi), %xmm9
2736 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,3,2,3,4,5,6,7]
2737 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
2738 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7]
2739 ; AVX-NEXT: vmovdqa 304(%rdi), %xmm0
2740 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2741 ; AVX-NEXT: vmovdqa 288(%rdi), %xmm3
2742 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2743 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7]
2744 ; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm3
2745 ; AVX-NEXT: vmovdqa 320(%rdi), %xmm0
2746 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2747 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
2748 ; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
2749 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
2750 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0
2751 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2752 ; AVX-NEXT: vmovdqa 272(%rdi), %xmm0
2753 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2754 ; AVX-NEXT: vmovdqa 256(%rdi), %xmm2
2755 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2756 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
2757 ; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm2
2758 ; AVX-NEXT: vmovdqa 240(%rdi), %xmm0
2759 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2760 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7]
2761 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
2762 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7]
2763 ; AVX-NEXT: vmovdqa 208(%rdi), %xmm0
2764 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2765 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm3
2766 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2767 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7]
2768 ; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm3
2769 ; AVX-NEXT: vmovdqa 224(%rdi), %xmm0
2770 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2771 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
2772 ; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
2773 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
2774 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0
2775 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2776 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm13
2777 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm11
2778 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7]
2779 ; AVX-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill
2780 ; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2781 ; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1
2782 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm3
2783 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,3,2,3,4,5,6,7]
2784 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
2785 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
2786 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
2787 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm4
2788 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6],xmm4[7]
2789 ; AVX-NEXT: vmovdqa %xmm0, %xmm15
2790 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2791 ; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm0
2792 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
2793 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,1,2,1]
2794 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2795 ; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,6,5]
2796 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm12[6,7]
2797 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2798 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2799 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
2800 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2801 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5,6],xmm8[7]
2802 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
2803 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2804 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm10[2],xmm7[3,4],xmm10[5],xmm7[6,7]
2805 ; AVX-NEXT: vpblendw $73, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
2806 ; AVX-NEXT: # xmm12 = mem[0],xmm12[1,2],mem[3],xmm12[4,5],mem[6],xmm12[7]
2807 ; AVX-NEXT: vpshufb %xmm2, %xmm12, %xmm12
2808 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0
2809 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2810 ; AVX-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm0 # 16-byte Folded Reload
2811 ; AVX-NEXT: # xmm0 = xmm14[0,1],mem[2],xmm14[3,4],mem[5],xmm14[6,7]
2812 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6],xmm9[7]
2813 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2814 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
2815 ; AVX-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
2816 ; AVX-NEXT: # xmm12 = xmm12[0,1],mem[2],xmm12[3,4],mem[5],xmm12[6,7]
2817 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2818 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1,2],xmm14[3],xmm12[4,5],xmm14[6],xmm12[7]
2819 ; AVX-NEXT: vpshufb %xmm2, %xmm12, %xmm12
2820 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0
2821 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2822 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7]
2823 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6],xmm3[7]
2824 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2825 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7]
2826 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0],xmm12[1,2],xmm1[3],xmm12[4,5],xmm1[6],xmm12[7]
2827 ; AVX-NEXT: vpshufb %xmm2, %xmm12, %xmm12
2828 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0
2829 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2830 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
2831 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
2832 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7]
2833 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
2834 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3],xmm15[4],xmm0[5,6],xmm15[7]
2835 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2836 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
2837 ; AVX-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
2838 ; AVX-NEXT: # xmm12 = mem[0,1],xmm12[2],mem[3,4],xmm12[5],mem[6,7]
2839 ; AVX-NEXT: vpblendw $73, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
2840 ; AVX-NEXT: # xmm12 = mem[0],xmm12[1,2],mem[3],xmm12[4,5],mem[6],xmm12[7]
2841 ; AVX-NEXT: vpshufb %xmm2, %xmm12, %xmm2
2842 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
2843 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2844 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5,6],xmm6[7]
2845 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15]
2846 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2847 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[2,1,2,3]
2848 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,1,2,3,4,5,6,7]
2849 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3,4,5,6,7]
2850 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm10[0,1],xmm7[2],xmm10[3,4],xmm7[5],xmm10[6,7]
2851 ; AVX-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
2852 ; AVX-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3],mem[4],xmm12[5,6],mem[7]
2853 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
2854 ; AVX-NEXT: vpshufb %xmm1, %xmm12, %xmm12
2855 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm8
2856 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2857 ; AVX-NEXT: vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2858 ; AVX-NEXT: # xmm0 = mem[0],xmm0[1],mem[2,3],xmm0[4],mem[5,6],xmm0[7]
2859 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
2860 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[2,1,2,3]
2861 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,1,2,3,4,5,6,7]
2862 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3,4,5,6,7]
2863 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2864 ; AVX-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm10 # 16-byte Folded Reload
2865 ; AVX-NEXT: # xmm10 = mem[0,1],xmm5[2],mem[3,4],xmm5[5],mem[6,7]
2866 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6],xmm14[7]
2867 ; AVX-NEXT: vpshufb %xmm1, %xmm9, %xmm9
2868 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0
2869 ; AVX-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload
2870 ; AVX-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm7 # 16-byte Folded Reload
2871 ; AVX-NEXT: # xmm7 = xmm5[0],mem[1],xmm5[2,3],mem[4],xmm5[5,6],mem[7]
2872 ; AVX-NEXT: vpshufb %xmm2, %xmm7, %xmm7
2873 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,1,2,3]
2874 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7]
2875 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3,4,5,6,7]
2876 ; AVX-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
2877 ; AVX-NEXT: # xmm4 = xmm4[0,1],mem[2],xmm4[3,4],mem[5],xmm4[6,7]
2878 ; AVX-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload
2879 ; AVX-NEXT: # xmm3 = xmm4[0],mem[1],xmm4[2,3],mem[4],xmm4[5,6],mem[7]
2880 ; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm3
2881 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
2882 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm13[0],xmm11[1],xmm13[2,3],xmm11[4],xmm13[5,6],xmm11[7]
2883 ; AVX-NEXT: vpshufb %xmm2, %xmm4, %xmm2
2884 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[2,1,2,3]
2885 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
2886 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3,4,5,6,7]
2887 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2888 ; AVX-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
2889 ; AVX-NEXT: # xmm4 = xmm4[0,1],mem[2],xmm4[3,4],mem[5],xmm4[6,7]
2890 ; AVX-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
2891 ; AVX-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3],mem[4],xmm4[5,6],mem[7]
2892 ; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm1
2893 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2894 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2895 ; AVX-NEXT: vmovaps %ymm2, (%rsi)
2896 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2897 ; AVX-NEXT: vmovaps %ymm2, 64(%rsi)
2898 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2899 ; AVX-NEXT: vmovaps %ymm2, 96(%rsi)
2900 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2901 ; AVX-NEXT: vmovaps %ymm2, 32(%rsi)
2902 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2903 ; AVX-NEXT: vmovaps %ymm2, 64(%rdx)
2904 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2905 ; AVX-NEXT: vmovaps %ymm2, (%rdx)
2906 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2907 ; AVX-NEXT: vmovaps %ymm2, 96(%rdx)
2908 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2909 ; AVX-NEXT: vmovaps %ymm2, 32(%rdx)
2910 ; AVX-NEXT: vmovaps %ymm1, 64(%rcx)
2911 ; AVX-NEXT: vmovaps %ymm3, (%rcx)
2912 ; AVX-NEXT: vmovaps %ymm0, 96(%rcx)
2913 ; AVX-NEXT: vmovaps %ymm8, 32(%rcx)
2914 ; AVX-NEXT: addq $408, %rsp # imm = 0x198
2915 ; AVX-NEXT: vzeroupper
2918 ; AVX2-LABEL: load_i16_stride3_vf64:
2920 ; AVX2-NEXT: subq $136, %rsp
2921 ; AVX2-NEXT: vmovdqa (%rdi), %ymm1
2922 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
2923 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm4
2924 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm5
2925 ; AVX2-NEXT: vmovdqa 288(%rdi), %ymm9
2926 ; AVX2-NEXT: vmovdqa 320(%rdi), %ymm10
2927 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm12
2928 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm13
2929 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
2930 ; AVX2-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm0
2931 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
2932 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15]
2933 ; AVX2-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
2934 ; AVX2-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm3
2935 ; AVX2-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm8
2936 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
2937 ; AVX2-NEXT: vpblendvb %ymm0, %ymm13, %ymm12, %ymm6
2938 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2939 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
2940 ; AVX2-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm6
2941 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2942 ; AVX2-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm13
2943 ; AVX2-NEXT: vpblendvb %ymm7, %ymm10, %ymm9, %ymm6
2944 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2945 ; AVX2-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm10
2946 ; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
2947 ; AVX2-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill
2948 ; AVX2-NEXT: vmovdqa 176(%rdi), %xmm5
2949 ; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm11
2950 ; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm14
2951 ; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm6
2952 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm7
2953 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7]
2954 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
2955 ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1
2956 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
2957 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3,4,5,6,7],ymm15[8,9,10],ymm1[11,12,13,14,15]
2958 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,6,5,4,7]
2959 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2960 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2961 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,0,1]
2962 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14],ymm1[15]
2963 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
2964 ; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
2965 ; AVX2-NEXT: vmovdqa 368(%rdi), %xmm15
2966 ; AVX2-NEXT: vmovdqa 352(%rdi), %xmm4
2967 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm15[2],xmm4[3,4],xmm15[5],xmm4[6,7]
2968 ; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm2
2969 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
2970 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
2971 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
2972 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
2973 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2974 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm8[2,3,0,1]
2975 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6],ymm1[7],ymm8[8],ymm1[9],ymm8[10,11],ymm1[12],ymm8[13,14],ymm1[15]
2976 ; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
2977 ; AVX2-NEXT: vmovdqa %ymm3, %ymm2
2978 ; AVX2-NEXT: vmovdqa 272(%rdi), %xmm8
2979 ; AVX2-NEXT: vmovdqa 256(%rdi), %xmm3
2980 ; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7]
2981 ; AVX2-NEXT: vpshufb %xmm0, %xmm12, %xmm12
2982 ; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
2983 ; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15]
2984 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
2985 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7]
2986 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2987 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm11[2,3,0,1]
2988 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6],ymm1[7],ymm11[8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14],ymm1[15]
2989 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm12
2990 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2
2991 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm11
2992 ; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm2[2],xmm11[3,4],xmm2[5],xmm11[6,7]
2993 ; AVX2-NEXT: vpshufb %xmm0, %xmm9, %xmm0
2994 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
2995 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15]
2996 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,4,7]
2997 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
2998 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2999 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3000 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1]
3001 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
3002 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
3003 ; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm0
3004 ; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7]
3005 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
3006 ; AVX2-NEXT: vpshufb %xmm1, %xmm12, %xmm12
3007 ; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
3008 ; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15]
3009 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
3010 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
3011 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm13[2,3,0,1]
3012 ; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15]
3013 ; AVX2-NEXT: vpshufb %ymm9, %ymm12, %ymm12
3014 ; AVX2-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7]
3015 ; AVX2-NEXT: vpshufb %xmm1, %xmm13, %xmm13
3016 ; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
3017 ; AVX2-NEXT: vpblendw {{.*#+}} ymm13 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15]
3018 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,6,7,4]
3019 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm13[4,5,6,7]
3020 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
3021 ; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
3022 ; AVX2-NEXT: vpshufb %ymm9, %ymm10, %ymm10
3023 ; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
3024 ; AVX2-NEXT: vpshufb %xmm1, %xmm12, %xmm12
3025 ; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
3026 ; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0,1,2],ymm12[3,4,5,6,7],ymm10[8,9,10],ymm12[11,12,13,14,15]
3027 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4]
3028 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
3029 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,3,0,1]
3030 ; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15]
3031 ; AVX2-NEXT: vpshufb %ymm9, %ymm12, %ymm9
3032 ; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm11[2],xmm2[3,4],xmm11[5],xmm2[6,7]
3033 ; AVX2-NEXT: vpshufb %xmm1, %xmm12, %xmm1
3034 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3035 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3,4,5,6,7],ymm9[8,9,10],ymm1[11,12,13,14,15]
3036 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4]
3037 ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm1[4,5,6,7]
3038 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
3039 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,3,0,1]
3040 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7],ymm1[8],ymm9[9,10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15]
3041 ; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7]
3042 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
3043 ; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1
3044 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
3045 ; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm5
3046 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
3047 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7]
3048 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
3049 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm12[2,3,0,1]
3050 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7],ymm1[8],ymm12[9,10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15]
3051 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3],xmm15[4],xmm4[5,6],xmm15[7]
3052 ; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1
3053 ; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm4
3054 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3055 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7]
3056 ; AVX2-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload
3057 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
3058 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
3059 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6],xmm8[7]
3060 ; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4
3061 ; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm3
3062 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
3063 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
3064 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm6[2,3,0,1]
3065 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7],ymm4[8],ymm6[9,10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15]
3066 ; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4
3067 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2,3],xmm2[4],xmm11[5,6],xmm2[7]
3068 ; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm2
3069 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
3070 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
3071 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3072 ; AVX2-NEXT: vmovaps %ymm4, (%rsi)
3073 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3074 ; AVX2-NEXT: vmovaps %ymm4, 64(%rsi)
3075 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3076 ; AVX2-NEXT: vmovaps %ymm4, 96(%rsi)
3077 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3078 ; AVX2-NEXT: vmovaps %ymm4, 32(%rsi)
3079 ; AVX2-NEXT: vmovdqa %ymm10, 64(%rdx)
3080 ; AVX2-NEXT: vmovdqa %ymm14, (%rdx)
3081 ; AVX2-NEXT: vmovdqa %ymm13, 96(%rdx)
3082 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
3083 ; AVX2-NEXT: vmovdqa %ymm3, 64(%rcx)
3084 ; AVX2-NEXT: vmovdqa %ymm2, (%rcx)
3085 ; AVX2-NEXT: vmovdqa %ymm1, 96(%rcx)
3086 ; AVX2-NEXT: vmovdqa %ymm5, 32(%rcx)
3087 ; AVX2-NEXT: addq $136, %rsp
3088 ; AVX2-NEXT: vzeroupper
3091 ; AVX2-FP-LABEL: load_i16_stride3_vf64:
3093 ; AVX2-FP-NEXT: subq $136, %rsp
3094 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1
3095 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm2
3096 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm4
3097 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm5
3098 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm9
3099 ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm10
3100 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm12
3101 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm13
3102 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
3103 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm0
3104 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
3105 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15]
3106 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
3107 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm3
3108 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm8
3109 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
3110 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm13, %ymm12, %ymm6
3111 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3112 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
3113 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm6
3114 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3115 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm13
3116 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm10, %ymm9, %ymm6
3117 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3118 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm10
3119 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
3120 ; AVX2-FP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill
3121 ; AVX2-FP-NEXT: vmovdqa 176(%rdi), %xmm5
3122 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm11
3123 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm14
3124 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm6
3125 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm7
3126 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7]
3127 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
3128 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
3129 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3130 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3,4,5,6,7],ymm15[8,9,10],ymm1[11,12,13,14,15]
3131 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,6,5,4,7]
3132 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
3133 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3134 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,0,1]
3135 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14],ymm1[15]
3136 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
3137 ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
3138 ; AVX2-FP-NEXT: vmovdqa 368(%rdi), %xmm15
3139 ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %xmm4
3140 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm15[2],xmm4[3,4],xmm15[5],xmm4[6,7]
3141 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm2
3142 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
3143 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
3144 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
3145 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
3146 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3147 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm8[2,3,0,1]
3148 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6],ymm1[7],ymm8[8],ymm1[9],ymm8[10,11],ymm1[12],ymm8[13,14],ymm1[15]
3149 ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
3150 ; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm2
3151 ; AVX2-FP-NEXT: vmovdqa 272(%rdi), %xmm8
3152 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm3
3153 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7]
3154 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm12
3155 ; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
3156 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15]
3157 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
3158 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7]
3159 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3160 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm11[2,3,0,1]
3161 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6],ymm1[7],ymm11[8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14],ymm1[15]
3162 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm1, %ymm12
3163 ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm2
3164 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm11
3165 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm2[2],xmm11[3,4],xmm2[5],xmm11[6,7]
3166 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm9, %xmm0
3167 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
3168 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15]
3169 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,4,7]
3170 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
3171 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3172 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3173 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1]
3174 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
3175 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
3176 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
3177 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7]
3178 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
3179 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm12, %xmm12
3180 ; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
3181 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15]
3182 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
3183 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
3184 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm13[2,3,0,1]
3185 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15]
3186 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm12, %ymm12
3187 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7]
3188 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm13, %xmm13
3189 ; AVX2-FP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
3190 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15]
3191 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,6,7,4]
3192 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm13[4,5,6,7]
3193 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
3194 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
3195 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm10, %ymm10
3196 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
3197 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm12, %xmm12
3198 ; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
3199 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0,1,2],ymm12[3,4,5,6,7],ymm10[8,9,10],ymm12[11,12,13,14,15]
3200 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4]
3201 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
3202 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,3,0,1]
3203 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15]
3204 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm12, %ymm9
3205 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm11[2],xmm2[3,4],xmm11[5],xmm2[6,7]
3206 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm12, %xmm1
3207 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3208 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3,4,5,6,7],ymm9[8,9,10],ymm1[11,12,13,14,15]
3209 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4]
3210 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm1[4,5,6,7]
3211 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
3212 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,3,0,1]
3213 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7],ymm1[8],ymm9[9,10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15]
3214 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7]
3215 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
3216 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
3217 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
3218 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm5, %xmm5
3219 ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
3220 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7]
3221 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
3222 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm12[2,3,0,1]
3223 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7],ymm1[8],ymm12[9,10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15]
3224 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3],xmm15[4],xmm4[5,6],xmm15[7]
3225 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
3226 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm4, %xmm4
3227 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3228 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7]
3229 ; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload
3230 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
3231 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
3232 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6],xmm8[7]
3233 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm4
3234 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm3, %xmm3
3235 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
3236 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
3237 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm6[2,3,0,1]
3238 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7],ymm4[8],ymm6[9,10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15]
3239 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm4
3240 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2,3],xmm2[4],xmm11[5,6],xmm2[7]
3241 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm2, %xmm2
3242 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
3243 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
3244 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3245 ; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi)
3246 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3247 ; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rsi)
3248 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3249 ; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rsi)
3250 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3251 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rsi)
3252 ; AVX2-FP-NEXT: vmovdqa %ymm10, 64(%rdx)
3253 ; AVX2-FP-NEXT: vmovdqa %ymm14, (%rdx)
3254 ; AVX2-FP-NEXT: vmovdqa %ymm13, 96(%rdx)
3255 ; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rdx)
3256 ; AVX2-FP-NEXT: vmovdqa %ymm3, 64(%rcx)
3257 ; AVX2-FP-NEXT: vmovdqa %ymm2, (%rcx)
3258 ; AVX2-FP-NEXT: vmovdqa %ymm1, 96(%rcx)
3259 ; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%rcx)
3260 ; AVX2-FP-NEXT: addq $136, %rsp
3261 ; AVX2-FP-NEXT: vzeroupper
3262 ; AVX2-FP-NEXT: retq
3264 ; AVX2-FCP-LABEL: load_i16_stride3_vf64:
3265 ; AVX2-FCP: # %bb.0:
3266 ; AVX2-FCP-NEXT: subq $136, %rsp
3267 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1
3268 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
3269 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm4
3270 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm5
3271 ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm9
3272 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm10
3273 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm12
3274 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm13
3275 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
3276 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm0
3277 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
3278 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15]
3279 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
3280 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm3
3281 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm8
3282 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
3283 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm13, %ymm12, %ymm6
3284 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3285 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
3286 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm6
3287 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3288 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm13
3289 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm10, %ymm9, %ymm6
3290 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3291 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm10
3292 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
3293 ; AVX2-FCP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill
3294 ; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm5
3295 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm11
3296 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm14
3297 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm6
3298 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm7
3299 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7]
3300 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
3301 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
3302 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3303 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3,4,5,6,7],ymm15[8,9,10],ymm1[11,12,13,14,15]
3304 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,6,5,4,7]
3305 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
3306 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3307 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,0,1]
3308 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14],ymm1[15]
3309 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
3310 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
3311 ; AVX2-FCP-NEXT: vmovdqa 368(%rdi), %xmm15
3312 ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %xmm4
3313 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm15[2],xmm4[3,4],xmm15[5],xmm4[6,7]
3314 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2
3315 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
3316 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
3317 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
3318 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
3319 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3320 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm8[2,3,0,1]
3321 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6],ymm1[7],ymm8[8],ymm1[9],ymm8[10,11],ymm1[12],ymm8[13,14],ymm1[15]
3322 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
3323 ; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm2
3324 ; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm8
3325 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm3
3326 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7]
3327 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm12
3328 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
3329 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15]
3330 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
3331 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7]
3332 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3333 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm11[2,3,0,1]
3334 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6],ymm1[7],ymm11[8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14],ymm1[15]
3335 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm12
3336 ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm2
3337 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm11
3338 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm2[2],xmm11[3,4],xmm2[5],xmm11[6,7]
3339 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm0
3340 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
3341 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15]
3342 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,4,7]
3343 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
3344 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3345 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3346 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1]
3347 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
3348 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
3349 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
3350 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7]
3351 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
3352 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm12
3353 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
3354 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15]
3355 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
3356 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
3357 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm13[2,3,0,1]
3358 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15]
3359 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm12
3360 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7]
3361 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm13
3362 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
3363 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15]
3364 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,6,7,4]
3365 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm13[4,5,6,7]
3366 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
3367 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
3368 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm10
3369 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
3370 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm12
3371 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
3372 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0,1,2],ymm12[3,4,5,6,7],ymm10[8,9,10],ymm12[11,12,13,14,15]
3373 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4]
3374 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
3375 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,3,0,1]
3376 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15]
3377 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm9
3378 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm11[2],xmm2[3,4],xmm11[5],xmm2[6,7]
3379 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm1
3380 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3381 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3,4,5,6,7],ymm9[8,9,10],ymm1[11,12,13,14,15]
3382 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4]
3383 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm1[4,5,6,7]
3384 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
3385 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,3,0,1]
3386 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7],ymm1[8],ymm9[9,10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15]
3387 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7]
3388 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
3389 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
3390 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
3391 ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm5
3392 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
3393 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7]
3394 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
3395 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm12[2,3,0,1]
3396 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7],ymm1[8],ymm12[9,10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15]
3397 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3],xmm15[4],xmm4[5,6],xmm15[7]
3398 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
3399 ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm4
3400 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3401 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7]
3402 ; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload
3403 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
3404 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
3405 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6],xmm8[7]
3406 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4
3407 ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3
3408 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
3409 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
3410 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm6[2,3,0,1]
3411 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7],ymm4[8],ymm6[9,10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15]
3412 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4
3413 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2,3],xmm2[4],xmm11[5,6],xmm2[7]
3414 ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2
3415 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
3416 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
3417 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3418 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi)
3419 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3420 ; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rsi)
3421 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3422 ; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rsi)
3423 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3424 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rsi)
3425 ; AVX2-FCP-NEXT: vmovdqa %ymm10, 64(%rdx)
3426 ; AVX2-FCP-NEXT: vmovdqa %ymm14, (%rdx)
3427 ; AVX2-FCP-NEXT: vmovdqa %ymm13, 96(%rdx)
3428 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rdx)
3429 ; AVX2-FCP-NEXT: vmovdqa %ymm3, 64(%rcx)
3430 ; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rcx)
3431 ; AVX2-FCP-NEXT: vmovdqa %ymm1, 96(%rcx)
3432 ; AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%rcx)
3433 ; AVX2-FCP-NEXT: addq $136, %rsp
3434 ; AVX2-FCP-NEXT: vzeroupper
3435 ; AVX2-FCP-NEXT: retq
3437 ; AVX512-LABEL: load_i16_stride3_vf64:
3439 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
3440 ; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm18
3441 ; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm20
3442 ; AVX512-NEXT: vmovdqa %ymm0, %ymm1
3443 ; AVX512-NEXT: vpternlogq $202, %ymm18, %ymm20, %ymm1
3444 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
3445 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
3446 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
3447 ; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm5
3448 ; AVX512-NEXT: vmovdqa 272(%rdi), %xmm1
3449 ; AVX512-NEXT: vmovdqa 256(%rdi), %xmm2
3450 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
3451 ; AVX512-NEXT: vmovdqa %xmm2, %xmm3
3452 ; AVX512-NEXT: vmovdqa64 %xmm1, %xmm19
3453 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
3454 ; AVX512-NEXT: vpshufb %xmm13, %xmm6, %xmm6
3455 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
3456 ; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
3457 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
3458 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
3459 ; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm21
3460 ; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm22
3461 ; AVX512-NEXT: vmovdqa %ymm0, %ymm8
3462 ; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm8
3463 ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
3464 ; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15]
3465 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
3466 ; AVX512-NEXT: vpshufb %ymm10, %ymm8, %ymm11
3467 ; AVX512-NEXT: vmovdqa 304(%rdi), %xmm1
3468 ; AVX512-NEXT: vmovdqa 288(%rdi), %xmm2
3469 ; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
3470 ; AVX512-NEXT: vmovdqa %xmm2, %xmm4
3471 ; AVX512-NEXT: vmovdqa %xmm1, %xmm8
3472 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
3473 ; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12
3474 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
3475 ; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16
3476 ; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm23
3477 ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm11
3478 ; AVX512-NEXT: vmovdqa %ymm0, %ymm5
3479 ; AVX512-NEXT: vpternlogq $202, %ymm23, %ymm11, %ymm5
3480 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1]
3481 ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15]
3482 ; AVX512-NEXT: vpshufb %ymm10, %ymm5, %ymm10
3483 ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm15
3484 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm5
3485 ; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7]
3486 ; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12
3487 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7]
3488 ; AVX512-NEXT: vmovdqa64 (%rdi), %ymm24
3489 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm12
3490 ; AVX512-NEXT: vmovdqa %ymm0, %ymm10
3491 ; AVX512-NEXT: vpternlogq $202, %ymm12, %ymm24, %ymm10
3492 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1]
3493 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15]
3494 ; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm7
3495 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm10
3496 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm1
3497 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7]
3498 ; AVX512-NEXT: vpshufb %xmm13, %xmm2, %xmm2
3499 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
3500 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15]
3501 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
3502 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
3503 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17
3504 ; AVX512-NEXT: vmovdqa %ymm0, %ymm2
3505 ; AVX512-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm2
3506 ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
3507 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
3508 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
3509 ; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm2
3510 ; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
3511 ; AVX512-NEXT: vmovdqa64 %xmm8, %xmm25
3512 ; AVX512-NEXT: vmovdqa64 %xmm4, %xmm26
3513 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
3514 ; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm7
3515 ; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7]
3516 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7]
3517 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
3518 ; AVX512-NEXT: vmovdqa %ymm13, %ymm2
3519 ; AVX512-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm2
3520 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
3521 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
3522 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
3523 ; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2
3524 ; AVX512-NEXT: vmovdqa64 %xmm19, %xmm8
3525 ; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
3526 ; AVX512-NEXT: vmovdqa64 %xmm3, %xmm27
3527 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
3528 ; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm14
3529 ; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
3530 ; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15]
3531 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4]
3532 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
3533 ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19
3534 ; AVX512-NEXT: vmovdqa %ymm0, %ymm2
3535 ; AVX512-NEXT: vpternlogq $202, %ymm11, %ymm23, %ymm2
3536 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
3537 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
3538 ; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm2
3539 ; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7]
3540 ; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm6
3541 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7]
3542 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
3543 ; AVX512-NEXT: vmovdqa %ymm13, %ymm6
3544 ; AVX512-NEXT: vpternlogq $202, %ymm24, %ymm12, %ymm6
3545 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
3546 ; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
3547 ; AVX512-NEXT: vpshufb %ymm4, %ymm6, %ymm4
3548 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7]
3549 ; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm3
3550 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
3551 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
3552 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
3553 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
3554 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
3555 ; AVX512-NEXT: vpternlogq $226, %ymm23, %ymm13, %ymm11
3556 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
3557 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15]
3558 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
3559 ; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3
3560 ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7]
3561 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
3562 ; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm4
3563 ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
3564 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
3565 ; AVX512-NEXT: vpternlogq $226, %ymm24, %ymm0, %ymm12
3566 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
3567 ; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
3568 ; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4
3569 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7]
3570 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
3571 ; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1
3572 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3573 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
3574 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
3575 ; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13
3576 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1]
3577 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15]
3578 ; AVX512-NEXT: vmovdqa64 %xmm25, %xmm4
3579 ; AVX512-NEXT: vmovdqa64 %xmm26, %xmm7
3580 ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
3581 ; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm4
3582 ; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3
3583 ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
3584 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
3585 ; AVX512-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm0
3586 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
3587 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
3588 ; AVX512-NEXT: vpshufb %ymm11, %ymm0, %ymm0
3589 ; AVX512-NEXT: vmovdqa64 %xmm27, %xmm4
3590 ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
3591 ; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm4
3592 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3593 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
3594 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
3595 ; AVX512-NEXT: vmovdqa64 %zmm17, (%rsi)
3596 ; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rsi)
3597 ; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rdx)
3598 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdx)
3599 ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rcx)
3600 ; AVX512-NEXT: vmovdqa64 %zmm1, (%rcx)
3601 ; AVX512-NEXT: vzeroupper
3604 ; AVX512-FCP-LABEL: load_i16_stride3_vf64:
3605 ; AVX512-FCP: # %bb.0:
3606 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
3607 ; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm18
3608 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm20
3609 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1
3610 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm18, %ymm20, %ymm1
3611 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
3612 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
3613 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
3614 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm5
3615 ; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm1
3616 ; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %xmm2
3617 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
3618 ; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm3
3619 ; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm19
3620 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
3621 ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6
3622 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
3623 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
3624 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
3625 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
3626 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm21
3627 ; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22
3628 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm8
3629 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm8
3630 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
3631 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15]
3632 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
3633 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11
3634 ; AVX512-FCP-NEXT: vmovdqa 304(%rdi), %xmm1
3635 ; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %xmm2
3636 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
3637 ; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm4
3638 ; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm8
3639 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
3640 ; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12
3641 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
3642 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16
3643 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23
3644 ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm11
3645 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm5
3646 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm23, %ymm11, %ymm5
3647 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1]
3648 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15]
3649 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm10
3650 ; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm15
3651 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm5
3652 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7]
3653 ; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12
3654 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7]
3655 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm24
3656 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm12
3657 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm10
3658 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm12, %ymm24, %ymm10
3659 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1]
3660 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15]
3661 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm7
3662 ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm10
3663 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
3664 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7]
3665 ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2
3666 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
3667 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15]
3668 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
3669 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
3670 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17
3671 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2
3672 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm2
3673 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
3674 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
3675 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
3676 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
3677 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
3678 ; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm25
3679 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm26
3680 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
3681 ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7
3682 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7]
3683 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7]
3684 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
3685 ; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm2
3686 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm2
3687 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
3688 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
3689 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
3690 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
3691 ; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm8
3692 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
3693 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm27
3694 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
3695 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm14
3696 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
3697 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15]
3698 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4]
3699 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
3700 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19
3701 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2
3702 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm11, %ymm23, %ymm2
3703 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
3704 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
3705 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
3706 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7]
3707 ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6
3708 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7]
3709 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
3710 ; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm6
3711 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm24, %ymm12, %ymm6
3712 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
3713 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
3714 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4
3715 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7]
3716 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3
3717 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
3718 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
3719 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
3720 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
3721 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
3722 ; AVX512-FCP-NEXT: vpternlogq $226, %ymm23, %ymm13, %ymm11
3723 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
3724 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15]
3725 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
3726 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
3727 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7]
3728 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
3729 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
3730 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
3731 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
3732 ; AVX512-FCP-NEXT: vpternlogq $226, %ymm24, %ymm0, %ymm12
3733 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
3734 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
3735 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
3736 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7]
3737 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
3738 ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
3739 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3740 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
3741 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
3742 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13
3743 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1]
3744 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15]
3745 ; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm4
3746 ; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm7
3747 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
3748 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
3749 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
3750 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
3751 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
3752 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm0
3753 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
3754 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
3755 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0
3756 ; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm4
3757 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
3758 ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4
3759 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3760 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
3761 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
3762 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rsi)
3763 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rsi)
3764 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx)
3765 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rdx)
3766 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rcx)
3767 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rcx)
3768 ; AVX512-FCP-NEXT: vzeroupper
3769 ; AVX512-FCP-NEXT: retq
3771 ; AVX512DQ-LABEL: load_i16_stride3_vf64:
3772 ; AVX512DQ: # %bb.0:
3773 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
3774 ; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm18
3775 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm20
3776 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
3777 ; AVX512DQ-NEXT: vpternlogq $202, %ymm18, %ymm20, %ymm1
3778 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
3779 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
3780 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
3781 ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm2, %ymm5
3782 ; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm1
3783 ; AVX512DQ-NEXT: vmovdqa 256(%rdi), %xmm2
3784 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
3785 ; AVX512DQ-NEXT: vmovdqa %xmm2, %xmm3
3786 ; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm19
3787 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
3788 ; AVX512DQ-NEXT: vpshufb %xmm13, %xmm6, %xmm6
3789 ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
3790 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
3791 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
3792 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
3793 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm21
3794 ; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm22
3795 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8
3796 ; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm8
3797 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
3798 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15]
3799 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
3800 ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm8, %ymm11
3801 ; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm1
3802 ; AVX512DQ-NEXT: vmovdqa 288(%rdi), %xmm2
3803 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
3804 ; AVX512DQ-NEXT: vmovdqa %xmm2, %xmm4
3805 ; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm8
3806 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
3807 ; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12
3808 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
3809 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16
3810 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm23
3811 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm11
3812 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm5
3813 ; AVX512DQ-NEXT: vpternlogq $202, %ymm23, %ymm11, %ymm5
3814 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1]
3815 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15]
3816 ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm5, %ymm10
3817 ; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm15
3818 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm5
3819 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7]
3820 ; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12
3821 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7]
3822 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm24
3823 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm12
3824 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm10
3825 ; AVX512DQ-NEXT: vpternlogq $202, %ymm12, %ymm24, %ymm10
3826 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1]
3827 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15]
3828 ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm7
3829 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm10
3830 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm1
3831 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7]
3832 ; AVX512DQ-NEXT: vpshufb %xmm13, %xmm2, %xmm2
3833 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
3834 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15]
3835 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
3836 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
3837 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17
3838 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2
3839 ; AVX512DQ-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm2
3840 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
3841 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
3842 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
3843 ; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm2
3844 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
3845 ; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm25
3846 ; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm26
3847 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
3848 ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm7, %xmm7
3849 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7]
3850 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7]
3851 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
3852 ; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm2
3853 ; AVX512DQ-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm2
3854 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
3855 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
3856 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
3857 ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2
3858 ; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm8
3859 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
3860 ; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm27
3861 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
3862 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm14, %xmm14
3863 ; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
3864 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15]
3865 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4]
3866 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
3867 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19
3868 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2
3869 ; AVX512DQ-NEXT: vpternlogq $202, %ymm11, %ymm23, %ymm2
3870 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
3871 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
3872 ; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm2
3873 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7]
3874 ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm7, %xmm6
3875 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7]
3876 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
3877 ; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm6
3878 ; AVX512DQ-NEXT: vpternlogq $202, %ymm24, %ymm12, %ymm6
3879 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
3880 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
3881 ; AVX512DQ-NEXT: vpshufb %ymm4, %ymm6, %ymm4
3882 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7]
3883 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm3
3884 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
3885 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
3886 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
3887 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
3888 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
3889 ; AVX512DQ-NEXT: vpternlogq $226, %ymm23, %ymm13, %ymm11
3890 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
3891 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15]
3892 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
3893 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3
3894 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7]
3895 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
3896 ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm4
3897 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
3898 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
3899 ; AVX512DQ-NEXT: vpternlogq $226, %ymm24, %ymm0, %ymm12
3900 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
3901 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
3902 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4
3903 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7]
3904 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
3905 ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm1
3906 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3907 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
3908 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
3909 ; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13
3910 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1]
3911 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15]
3912 ; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm4
3913 ; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm7
3914 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
3915 ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm4
3916 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3
3917 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
3918 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
3919 ; AVX512DQ-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm0
3920 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
3921 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
3922 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm0
3923 ; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm4
3924 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
3925 ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm4, %xmm4
3926 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3927 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
3928 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
3929 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rsi)
3930 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rsi)
3931 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 64(%rdx)
3932 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rdx)
3933 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rcx)
3934 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rcx)
3935 ; AVX512DQ-NEXT: vzeroupper
3936 ; AVX512DQ-NEXT: retq
3938 ; AVX512DQ-FCP-LABEL: load_i16_stride3_vf64:
3939 ; AVX512DQ-FCP: # %bb.0:
3940 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
3941 ; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm18
3942 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm20
3943 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1
3944 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm18, %ymm20, %ymm1
3945 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
3946 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
3947 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
3948 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm5
3949 ; AVX512DQ-FCP-NEXT: vmovdqa 272(%rdi), %xmm1
3950 ; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %xmm2
3951 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
3952 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm3
3953 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm19
3954 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
3955 ; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6
3956 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
3957 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
3958 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
3959 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
3960 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm21
3961 ; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22
3962 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm8
3963 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm8
3964 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
3965 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15]
3966 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
3967 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11
3968 ; AVX512DQ-FCP-NEXT: vmovdqa 304(%rdi), %xmm1
3969 ; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %xmm2
3970 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
3971 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm4
3972 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm8
3973 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
3974 ; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12
3975 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
3976 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16
3977 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23
3978 ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm11
3979 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm5
3980 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm23, %ymm11, %ymm5
3981 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1]
3982 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15]
3983 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm10
3984 ; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm15
3985 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm5
3986 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7]
3987 ; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12
3988 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7]
3989 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm24
3990 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm12
3991 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm10
3992 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm12, %ymm24, %ymm10
3993 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1]
3994 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15]
3995 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm7
3996 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm10
3997 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
3998 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7]
3999 ; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2
4000 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4001 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15]
4002 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
4003 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
4004 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17
4005 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2
4006 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm2
4007 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
4008 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
4009 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
4010 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
4011 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
4012 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm25
4013 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm26
4014 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
4015 ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7
4016 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7]
4017 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7]
4018 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
4019 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm2
4020 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm2
4021 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
4022 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
4023 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
4024 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
4025 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm8
4026 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
4027 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm27
4028 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
4029 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm14
4030 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
4031 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15]
4032 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4]
4033 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
4034 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19
4035 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2
4036 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm11, %ymm23, %ymm2
4037 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
4038 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
4039 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
4040 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7]
4041 ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6
4042 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7]
4043 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
4044 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm6
4045 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm24, %ymm12, %ymm6
4046 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
4047 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
4048 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4
4049 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7]
4050 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3
4051 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
4052 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
4053 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
4054 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
4055 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
4056 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm23, %ymm13, %ymm11
4057 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
4058 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15]
4059 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
4060 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
4061 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7]
4062 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
4063 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
4064 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
4065 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
4066 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm24, %ymm0, %ymm12
4067 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
4068 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
4069 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
4070 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7]
4071 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
4072 ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
4073 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4074 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
4075 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
4076 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm13
4077 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1]
4078 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15]
4079 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm4
4080 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm7
4081 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
4082 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
4083 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
4084 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
4085 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
4086 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm20, %ymm18, %ymm0
4087 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
4088 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
4089 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0
4090 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm4
4091 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
4092 ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4
4093 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
4094 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
4095 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
4096 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rsi)
4097 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rsi)
4098 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx)
4099 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rdx)
4100 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rcx)
4101 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rcx)
4102 ; AVX512DQ-FCP-NEXT: vzeroupper
4103 ; AVX512DQ-FCP-NEXT: retq
4105 ; AVX512BW-LABEL: load_i16_stride3_vf64:
4106 ; AVX512BW: # %bb.0:
4107 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0
4108 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1
4109 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2
4110 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3
4111 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4
4112 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5
4113 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0]
4114 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7
4115 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm6, %zmm7
4116 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61]
4117 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm8, %zmm7
4118 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
4119 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm6
4120 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0]
4121 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9
4122 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm8, %zmm9
4123 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62]
4124 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm9
4125 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
4126 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm8
4127 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0]
4128 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm10, %zmm1
4129 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63]
4130 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm5, %zmm1
4131 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm10, %zmm3
4132 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm5, %zmm3
4133 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rsi)
4134 ; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rsi)
4135 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rdx)
4136 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rdx)
4137 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 64(%rcx)
4138 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rcx)
4139 ; AVX512BW-NEXT: vzeroupper
4140 ; AVX512BW-NEXT: retq
4142 ; AVX512BW-FCP-LABEL: load_i16_stride3_vf64:
4143 ; AVX512BW-FCP: # %bb.0:
4144 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0
4145 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1
4146 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
4147 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3
4148 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
4149 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5
4150 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0]
4151 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7
4152 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm6, %zmm7
4153 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61]
4154 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm8, %zmm7
4155 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
4156 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm6
4157 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0]
4158 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9
4159 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm8, %zmm9
4160 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62]
4161 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm10, %zmm9
4162 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
4163 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm8
4164 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0]
4165 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm10, %zmm1
4166 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63]
4167 ; AVX512BW-FCP-NEXT: vpermt2w %zmm0, %zmm5, %zmm1
4168 ; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm10, %zmm3
4169 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm5, %zmm3
4170 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi)
4171 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rsi)
4172 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rdx)
4173 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx)
4174 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rcx)
4175 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rcx)
4176 ; AVX512BW-FCP-NEXT: vzeroupper
4177 ; AVX512BW-FCP-NEXT: retq
4179 ; AVX512DQ-BW-LABEL: load_i16_stride3_vf64:
4180 ; AVX512DQ-BW: # %bb.0:
4181 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm0
4182 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm1
4183 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2
4184 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3
4185 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm4
4186 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm5
4187 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0]
4188 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7
4189 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm6, %zmm7
4190 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61]
4191 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm8, %zmm7
4192 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
4193 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm6
4194 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0]
4195 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm9
4196 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm8, %zmm9
4197 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62]
4198 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm9
4199 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
4200 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm8
4201 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0]
4202 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm10, %zmm1
4203 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63]
4204 ; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm5, %zmm1
4205 ; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm10, %zmm3
4206 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm5, %zmm3
4207 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%rsi)
4208 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rsi)
4209 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 64(%rdx)
4210 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rdx)
4211 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 64(%rcx)
4212 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rcx)
4213 ; AVX512DQ-BW-NEXT: vzeroupper
4214 ; AVX512DQ-BW-NEXT: retq
4216 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride3_vf64:
4217 ; AVX512DQ-BW-FCP: # %bb.0:
4218 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0
4219 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1
4220 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
4221 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3
4222 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
4223 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5
4224 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,0,0,0,0,0,0,0,0,0,0]
4225 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7
4226 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm6, %zmm7
4227 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61]
4228 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm8, %zmm7
4229 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
4230 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm6
4231 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,0,0,0,0,0,0,0,0,0,0,0]
4232 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9
4233 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm8, %zmm9
4234 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62]
4235 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm10, %zmm9
4236 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
4237 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm8
4238 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0,0,0,0,0,0,0]
4239 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm10, %zmm1
4240 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63]
4241 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm0, %zmm5, %zmm1
4242 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm10, %zmm3
4243 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm5, %zmm3
4244 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi)
4245 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rsi)
4246 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rdx)
4247 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx)
4248 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rcx)
4249 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rcx)
4250 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
4251 ; AVX512DQ-BW-FCP-NEXT: retq
4252 %wide.vec = load <192 x i16>, ptr %in.vec, align 64
4253 %strided.vec0 = shufflevector <192 x i16> %wide.vec, <192 x i16> poison, <64 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93, i32 96, i32 99, i32 102, i32 105, i32 108, i32 111, i32 114, i32 117, i32 120, i32 123, i32 126, i32 129, i32 132, i32 135, i32 138, i32 141, i32 144, i32 147, i32 150, i32 153, i32 156, i32 159, i32 162, i32 165, i32 168, i32 171, i32 174, i32 177, i32 180, i32 183, i32 186, i32 189>
4254 %strided.vec1 = shufflevector <192 x i16> %wide.vec, <192 x i16> poison, <64 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94, i32 97, i32 100, i32 103, i32 106, i32 109, i32 112, i32 115, i32 118, i32 121, i32 124, i32 127, i32 130, i32 133, i32 136, i32 139, i32 142, i32 145, i32 148, i32 151, i32 154, i32 157, i32 160, i32 163, i32 166, i32 169, i32 172, i32 175, i32 178, i32 181, i32 184, i32 187, i32 190>
4255 %strided.vec2 = shufflevector <192 x i16> %wide.vec, <192 x i16> poison, <64 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62, i32 65, i32 68, i32 71, i32 74, i32 77, i32 80, i32 83, i32 86, i32 89, i32 92, i32 95, i32 98, i32 101, i32 104, i32 107, i32 110, i32 113, i32 116, i32 119, i32 122, i32 125, i32 128, i32 131, i32 134, i32 137, i32 140, i32 143, i32 146, i32 149, i32 152, i32 155, i32 158, i32 161, i32 164, i32 167, i32 170, i32 173, i32 176, i32 179, i32 182, i32 185, i32 188, i32 191>
4256 store <64 x i16> %strided.vec0, ptr %out.vec0, align 64
4257 store <64 x i16> %strided.vec1, ptr %out.vec1, align 64
4258 store <64 x i16> %strided.vec2, ptr %out.vec2, align 64