1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
16 ; These patterns are produced by LoopVectorizer for interleaved stores.
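;
; As a hedged illustration (not part of the test; the function and parameter
; names below are made up for this sketch), a scalar C loop of roughly this
; shape is the kind of input the LoopVectorizer widens into the
; concatenate-then-interleave shufflevector + wide store pattern exercised
; by the functions below:
;
;   void store_stride2(const short *a, const short *b, short *out, int n) {
;     for (int i = 0; i < n; ++i) {
;       out[2 * i + 0] = a[i]; // lane 0 of the stride-2 interleave group
;       out[2 * i + 1] = b[i]; // lane 1 of the stride-2 interleave group
;     }
;   }
;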
18 define void @store_i16_stride2_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
; SSE-LABEL: store_i16_stride2_vf2:
; SSE: # %bb.0:
21 ; SSE-NEXT: movdqa (%rdi), %xmm0
22 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
23 ; SSE-NEXT: movq %xmm0, (%rdx)
; AVX-LABEL: store_i16_stride2_vf2:
; AVX: # %bb.0:
28 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
29 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
30 ; AVX-NEXT: vmovq %xmm0, (%rdx)
; AVX2-LABEL: store_i16_stride2_vf2:
; AVX2: # %bb.0:
35 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
36 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
37 ; AVX2-NEXT: vmovq %xmm0, (%rdx)
; AVX2-FP-LABEL: store_i16_stride2_vf2:
; AVX2-FP: # %bb.0:
42 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
43 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
44 ; AVX2-FP-NEXT: vmovq %xmm0, (%rdx)
; AVX2-FCP-LABEL: store_i16_stride2_vf2:
; AVX2-FCP: # %bb.0:
49 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
50 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
51 ; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx)
; AVX512-LABEL: store_i16_stride2_vf2:
; AVX512: # %bb.0:
56 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
57 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
58 ; AVX512-NEXT: vmovq %xmm0, (%rdx)
61 ; AVX512-FCP-LABEL: store_i16_stride2_vf2:
62 ; AVX512-FCP: # %bb.0:
63 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
64 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
65 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rdx)
66 ; AVX512-FCP-NEXT: retq
; AVX512DQ-LABEL: store_i16_stride2_vf2:
; AVX512DQ: # %bb.0:
70 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
71 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
72 ; AVX512DQ-NEXT: vmovq %xmm0, (%rdx)
75 ; AVX512DQ-FCP-LABEL: store_i16_stride2_vf2:
76 ; AVX512DQ-FCP: # %bb.0:
77 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
78 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
79 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx)
80 ; AVX512DQ-FCP-NEXT: retq
; AVX512BW-LABEL: store_i16_stride2_vf2:
; AVX512BW: # %bb.0:
84 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
85 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
86 ; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
89 ; AVX512BW-FCP-LABEL: store_i16_stride2_vf2:
90 ; AVX512BW-FCP: # %bb.0:
91 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
92 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
93 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
94 ; AVX512BW-FCP-NEXT: retq
96 ; AVX512DQ-BW-LABEL: store_i16_stride2_vf2:
97 ; AVX512DQ-BW: # %bb.0:
98 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
99 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
100 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
101 ; AVX512DQ-BW-NEXT: retq
103 ; AVX512DQ-BW-FCP-LABEL: store_i16_stride2_vf2:
104 ; AVX512DQ-BW-FCP: # %bb.0:
105 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
106 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
107 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
108 ; AVX512DQ-BW-FCP-NEXT: retq
109 %in.vec0 = load <2 x i16>, ptr %in.vecptr0, align 64
110 %in.vec1 = load <2 x i16>, ptr %in.vecptr1, align 64
111 %1 = shufflevector <2 x i16> %in.vec0, <2 x i16> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
112 %interleaved.vec = shufflevector <4 x i16> %1, <4 x i16> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
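; With %in.vec0 = <a0,a1> and %in.vec1 = <b0,b1>, %1 is the concatenation
; <a0,a1,b0,b1>, so the <0,2,1,3> mask above yields the stride-2 interleaved
; result <a0,b0,a1,b1> that the single store below writes out.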
store <4 x i16> %interleaved.vec, ptr %out.vec, align 64
ret void
}

117 define void @store_i16_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
; SSE-LABEL: store_i16_stride2_vf4:
; SSE: # %bb.0:
120 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
121 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
122 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
123 ; SSE-NEXT: movdqa %xmm1, (%rdx)
; AVX-LABEL: store_i16_stride2_vf4:
; AVX: # %bb.0:
128 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
129 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
130 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
131 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
; AVX2-LABEL: store_i16_stride2_vf4:
; AVX2: # %bb.0:
136 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
137 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
138 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
139 ; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
; AVX2-FP-LABEL: store_i16_stride2_vf4:
; AVX2-FP: # %bb.0:
144 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
145 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
146 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
147 ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX2-FCP-LABEL: store_i16_stride2_vf4:
; AVX2-FCP: # %bb.0:
152 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
153 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
154 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
155 ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rdx)
156 ; AVX2-FCP-NEXT: retq
; AVX512-LABEL: store_i16_stride2_vf4:
; AVX512: # %bb.0:
160 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
161 ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
162 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
163 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
166 ; AVX512-FCP-LABEL: store_i16_stride2_vf4:
167 ; AVX512-FCP: # %bb.0:
168 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
169 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
170 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
171 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx)
172 ; AVX512-FCP-NEXT: retq
; AVX512DQ-LABEL: store_i16_stride2_vf4:
; AVX512DQ: # %bb.0:
176 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
177 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
178 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
179 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
180 ; AVX512DQ-NEXT: retq
182 ; AVX512DQ-FCP-LABEL: store_i16_stride2_vf4:
183 ; AVX512DQ-FCP: # %bb.0:
184 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
185 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
186 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
187 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx)
188 ; AVX512DQ-FCP-NEXT: retq
; AVX512BW-LABEL: store_i16_stride2_vf4:
; AVX512BW: # %bb.0:
192 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
193 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
194 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
195 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rdx)
196 ; AVX512BW-NEXT: retq
198 ; AVX512BW-FCP-LABEL: store_i16_stride2_vf4:
199 ; AVX512BW-FCP: # %bb.0:
200 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
201 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
202 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
203 ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rdx)
204 ; AVX512BW-FCP-NEXT: retq
206 ; AVX512DQ-BW-LABEL: store_i16_stride2_vf4:
207 ; AVX512DQ-BW: # %bb.0:
208 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
209 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
210 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
211 ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rdx)
212 ; AVX512DQ-BW-NEXT: retq
214 ; AVX512DQ-BW-FCP-LABEL: store_i16_stride2_vf4:
215 ; AVX512DQ-BW-FCP: # %bb.0:
216 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
217 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
218 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
219 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rdx)
220 ; AVX512DQ-BW-FCP-NEXT: retq
221 %in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64
222 %in.vec1 = load <4 x i16>, ptr %in.vecptr1, align 64
223 %1 = shufflevector <4 x i16> %in.vec0, <4 x i16> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
224 %interleaved.vec = shufflevector <8 x i16> %1, <8 x i16> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
store <8 x i16> %interleaved.vec, ptr %out.vec, align 64
ret void
}

229 define void @store_i16_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
; SSE-LABEL: store_i16_stride2_vf8:
; SSE: # %bb.0:
232 ; SSE-NEXT: movdqa (%rdi), %xmm0
233 ; SSE-NEXT: movdqa (%rsi), %xmm1
234 ; SSE-NEXT: movdqa %xmm0, %xmm2
235 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
236 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
237 ; SSE-NEXT: movdqa %xmm0, 16(%rdx)
238 ; SSE-NEXT: movdqa %xmm2, (%rdx)
; AVX-LABEL: store_i16_stride2_vf8:
; AVX: # %bb.0:
243 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
244 ; AVX-NEXT: vmovdqa (%rsi), %xmm1
245 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
246 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
247 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
248 ; AVX-NEXT: vmovdqa %xmm2, (%rdx)
; AVX2-LABEL: store_i16_stride2_vf8:
; AVX2: # %bb.0:
253 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
254 ; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
255 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
256 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
257 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
258 ; AVX2-NEXT: vzeroupper
; AVX2-FP-LABEL: store_i16_stride2_vf8:
; AVX2-FP: # %bb.0:
263 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
264 ; AVX2-FP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
265 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
266 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
267 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rdx)
268 ; AVX2-FP-NEXT: vzeroupper
; AVX2-FCP-LABEL: store_i16_stride2_vf8:
; AVX2-FCP: # %bb.0:
273 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
274 ; AVX2-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
275 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
276 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
277 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rdx)
278 ; AVX2-FCP-NEXT: vzeroupper
279 ; AVX2-FCP-NEXT: retq
; AVX512-LABEL: store_i16_stride2_vf8:
; AVX512: # %bb.0:
283 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
284 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
285 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
286 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
287 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
288 ; AVX512-NEXT: vzeroupper
291 ; AVX512-FCP-LABEL: store_i16_stride2_vf8:
292 ; AVX512-FCP: # %bb.0:
293 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
294 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
295 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
296 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
297 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rdx)
298 ; AVX512-FCP-NEXT: vzeroupper
299 ; AVX512-FCP-NEXT: retq
; AVX512DQ-LABEL: store_i16_stride2_vf8:
; AVX512DQ: # %bb.0:
303 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
304 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
305 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
306 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
307 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
308 ; AVX512DQ-NEXT: vzeroupper
309 ; AVX512DQ-NEXT: retq
311 ; AVX512DQ-FCP-LABEL: store_i16_stride2_vf8:
312 ; AVX512DQ-FCP: # %bb.0:
313 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
314 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
315 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
316 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31]
317 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rdx)
318 ; AVX512DQ-FCP-NEXT: vzeroupper
319 ; AVX512DQ-FCP-NEXT: retq
; AVX512BW-LABEL: store_i16_stride2_vf8:
; AVX512BW: # %bb.0:
323 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
324 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
325 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
326 ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0
327 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx)
328 ; AVX512BW-NEXT: vzeroupper
329 ; AVX512BW-NEXT: retq
331 ; AVX512BW-FCP-LABEL: store_i16_stride2_vf8:
332 ; AVX512BW-FCP: # %bb.0:
333 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
334 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
335 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
336 ; AVX512BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0
337 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rdx)
338 ; AVX512BW-FCP-NEXT: vzeroupper
339 ; AVX512BW-FCP-NEXT: retq
341 ; AVX512DQ-BW-LABEL: store_i16_stride2_vf8:
342 ; AVX512DQ-BW: # %bb.0:
343 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
344 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
345 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
346 ; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm1, %ymm0
347 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rdx)
348 ; AVX512DQ-BW-NEXT: vzeroupper
349 ; AVX512DQ-BW-NEXT: retq
351 ; AVX512DQ-BW-FCP-LABEL: store_i16_stride2_vf8:
352 ; AVX512DQ-BW-FCP: # %bb.0:
353 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
354 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
355 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15]
356 ; AVX512DQ-BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0
357 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rdx)
358 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
359 ; AVX512DQ-BW-FCP-NEXT: retq
360 %in.vec0 = load <8 x i16>, ptr %in.vecptr0, align 64
361 %in.vec1 = load <8 x i16>, ptr %in.vecptr1, align 64
362 %1 = shufflevector <8 x i16> %in.vec0, <8 x i16> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
363 %interleaved.vec = shufflevector <16 x i16> %1, <16 x i16> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
store <16 x i16> %interleaved.vec, ptr %out.vec, align 64
ret void
}

368 define void @store_i16_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
; SSE-LABEL: store_i16_stride2_vf16:
; SSE: # %bb.0:
371 ; SSE-NEXT: movdqa (%rdi), %xmm0
372 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
373 ; SSE-NEXT: movdqa (%rsi), %xmm2
374 ; SSE-NEXT: movdqa 16(%rsi), %xmm3
375 ; SSE-NEXT: movdqa %xmm0, %xmm4
376 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
377 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
378 ; SSE-NEXT: movdqa %xmm1, %xmm2
379 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
380 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
381 ; SSE-NEXT: movdqa %xmm1, 32(%rdx)
382 ; SSE-NEXT: movdqa %xmm2, 48(%rdx)
383 ; SSE-NEXT: movdqa %xmm0, (%rdx)
384 ; SSE-NEXT: movdqa %xmm4, 16(%rdx)
; AVX-LABEL: store_i16_stride2_vf16:
; AVX: # %bb.0:
389 ; AVX-NEXT: vmovdqa (%rsi), %xmm0
390 ; AVX-NEXT: vmovdqa 16(%rsi), %xmm1
391 ; AVX-NEXT: vmovdqa (%rdi), %xmm2
392 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
393 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
394 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
395 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
396 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
397 ; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
398 ; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
399 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
400 ; AVX-NEXT: vmovdqa %xmm4, 16(%rdx)
; AVX2-LABEL: store_i16_stride2_vf16:
; AVX2: # %bb.0:
405 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
406 ; AVX2-NEXT: vmovdqa (%rsi), %ymm1
407 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
408 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
409 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[0,1],ymm2[0,1]
410 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
411 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
412 ; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
413 ; AVX2-NEXT: vzeroupper
; AVX2-FP-LABEL: store_i16_stride2_vf16:
; AVX2-FP: # %bb.0:
418 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
419 ; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm1
420 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
421 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
422 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[0,1],ymm2[0,1]
423 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
424 ; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rdx)
425 ; AVX2-FP-NEXT: vmovdqa %ymm1, (%rdx)
426 ; AVX2-FP-NEXT: vzeroupper
; AVX2-FCP-LABEL: store_i16_stride2_vf16:
; AVX2-FCP: # %bb.0:
431 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
432 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm1
433 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
434 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
435 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[0,1],ymm2[0,1]
436 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
437 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rdx)
438 ; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rdx)
439 ; AVX2-FCP-NEXT: vzeroupper
440 ; AVX2-FCP-NEXT: retq
; AVX512-LABEL: store_i16_stride2_vf16:
; AVX512: # %bb.0:
444 ; AVX512-NEXT: vmovdqa (%rsi), %xmm0
445 ; AVX512-NEXT: vmovdqa 16(%rsi), %xmm1
446 ; AVX512-NEXT: vmovdqa (%rdi), %xmm2
447 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3
448 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
449 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
450 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
451 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
452 ; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx)
453 ; AVX512-NEXT: vmovdqa %xmm2, 48(%rdx)
454 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
455 ; AVX512-NEXT: vmovdqa %xmm4, 16(%rdx)
458 ; AVX512-FCP-LABEL: store_i16_stride2_vf16:
459 ; AVX512-FCP: # %bb.0:
460 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm0
461 ; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
462 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2
463 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
464 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
465 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
466 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
467 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
468 ; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
469 ; AVX512-FCP-NEXT: vmovdqa %xmm2, 48(%rdx)
470 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx)
471 ; AVX512-FCP-NEXT: vmovdqa %xmm4, 16(%rdx)
472 ; AVX512-FCP-NEXT: retq
; AVX512DQ-LABEL: store_i16_stride2_vf16:
; AVX512DQ: # %bb.0:
476 ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0
477 ; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm1
478 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2
479 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3
480 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
481 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
482 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
483 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
484 ; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx)
485 ; AVX512DQ-NEXT: vmovdqa %xmm2, 48(%rdx)
486 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
487 ; AVX512DQ-NEXT: vmovdqa %xmm4, 16(%rdx)
488 ; AVX512DQ-NEXT: retq
490 ; AVX512DQ-FCP-LABEL: store_i16_stride2_vf16:
491 ; AVX512DQ-FCP: # %bb.0:
492 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm0
493 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
494 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2
495 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
496 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
497 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
498 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
499 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
500 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
501 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 48(%rdx)
502 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx)
503 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 16(%rdx)
504 ; AVX512DQ-FCP-NEXT: retq
; AVX512BW-LABEL: store_i16_stride2_vf16:
; AVX512BW: # %bb.0:
508 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
509 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
510 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31]
511 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
512 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
513 ; AVX512BW-NEXT: vzeroupper
514 ; AVX512BW-NEXT: retq
516 ; AVX512BW-FCP-LABEL: store_i16_stride2_vf16:
517 ; AVX512BW-FCP: # %bb.0:
518 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
519 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
520 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31]
521 ; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0
522 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
523 ; AVX512BW-FCP-NEXT: vzeroupper
524 ; AVX512BW-FCP-NEXT: retq
526 ; AVX512DQ-BW-LABEL: store_i16_stride2_vf16:
527 ; AVX512DQ-BW: # %bb.0:
528 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0
529 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
530 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31]
531 ; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
532 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx)
533 ; AVX512DQ-BW-NEXT: vzeroupper
534 ; AVX512DQ-BW-NEXT: retq
536 ; AVX512DQ-BW-FCP-LABEL: store_i16_stride2_vf16:
537 ; AVX512DQ-BW-FCP: # %bb.0:
538 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
539 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
540 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31]
541 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0
542 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
543 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
544 ; AVX512DQ-BW-FCP-NEXT: retq
545 %in.vec0 = load <16 x i16>, ptr %in.vecptr0, align 64
546 %in.vec1 = load <16 x i16>, ptr %in.vecptr1, align 64
547 %1 = shufflevector <16 x i16> %in.vec0, <16 x i16> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
548 %interleaved.vec = shufflevector <32 x i16> %1, <32 x i16> poison, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
store <32 x i16> %interleaved.vec, ptr %out.vec, align 64
ret void
}

553 define void @store_i16_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
; SSE-LABEL: store_i16_stride2_vf32:
; SSE: # %bb.0:
556 ; SSE-NEXT: movdqa (%rdi), %xmm0
557 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
558 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
559 ; SSE-NEXT: movdqa 48(%rdi), %xmm3
560 ; SSE-NEXT: movdqa (%rsi), %xmm4
561 ; SSE-NEXT: movdqa 16(%rsi), %xmm5
562 ; SSE-NEXT: movdqa 32(%rsi), %xmm6
563 ; SSE-NEXT: movdqa 48(%rsi), %xmm7
564 ; SSE-NEXT: movdqa %xmm0, %xmm8
565 ; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
566 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
567 ; SSE-NEXT: movdqa %xmm1, %xmm4
568 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
569 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
570 ; SSE-NEXT: movdqa %xmm2, %xmm5
571 ; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
572 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
573 ; SSE-NEXT: movdqa %xmm3, %xmm6
574 ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
575 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
576 ; SSE-NEXT: movdqa %xmm3, 96(%rdx)
577 ; SSE-NEXT: movdqa %xmm6, 112(%rdx)
578 ; SSE-NEXT: movdqa %xmm2, 64(%rdx)
579 ; SSE-NEXT: movdqa %xmm5, 80(%rdx)
580 ; SSE-NEXT: movdqa %xmm1, 32(%rdx)
581 ; SSE-NEXT: movdqa %xmm4, 48(%rdx)
582 ; SSE-NEXT: movdqa %xmm0, (%rdx)
583 ; SSE-NEXT: movdqa %xmm8, 16(%rdx)
; AVX-LABEL: store_i16_stride2_vf32:
; AVX: # %bb.0:
588 ; AVX-NEXT: vmovdqa (%rsi), %xmm0
589 ; AVX-NEXT: vmovdqa 16(%rsi), %xmm1
590 ; AVX-NEXT: vmovdqa 32(%rsi), %xmm2
591 ; AVX-NEXT: vmovdqa 48(%rsi), %xmm3
592 ; AVX-NEXT: vmovdqa (%rdi), %xmm4
593 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm5
594 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm6
595 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm7
596 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
597 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
598 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
599 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
600 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
601 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
602 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
603 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
604 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
605 ; AVX-NEXT: vmovdqa %xmm5, 16(%rdx)
606 ; AVX-NEXT: vmovdqa %xmm1, 32(%rdx)
607 ; AVX-NEXT: vmovdqa %xmm7, 48(%rdx)
608 ; AVX-NEXT: vmovdqa %xmm3, 96(%rdx)
609 ; AVX-NEXT: vmovdqa %xmm6, 112(%rdx)
610 ; AVX-NEXT: vmovdqa %xmm2, 64(%rdx)
611 ; AVX-NEXT: vmovdqa %xmm8, 80(%rdx)
; AVX2-LABEL: store_i16_stride2_vf32:
; AVX2: # %bb.0:
616 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
617 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
618 ; AVX2-NEXT: vmovdqa (%rsi), %ymm2
619 ; AVX2-NEXT: vmovdqa 32(%rsi), %ymm3
620 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
621 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
622 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3]
623 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1]
624 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
625 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
626 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3]
627 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1]
628 ; AVX2-NEXT: vmovdqa %ymm1, 64(%rdx)
629 ; AVX2-NEXT: vmovdqa %ymm3, 96(%rdx)
630 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
631 ; AVX2-NEXT: vmovdqa %ymm2, 32(%rdx)
632 ; AVX2-NEXT: vzeroupper
; AVX2-FP-LABEL: store_i16_stride2_vf32:
; AVX2-FP: # %bb.0:
637 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
638 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1
639 ; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm2
640 ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm3
641 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
642 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
643 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3]
644 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1]
645 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
646 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
647 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3]
648 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1]
649 ; AVX2-FP-NEXT: vmovdqa %ymm1, 64(%rdx)
650 ; AVX2-FP-NEXT: vmovdqa %ymm3, 96(%rdx)
651 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rdx)
652 ; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%rdx)
653 ; AVX2-FP-NEXT: vzeroupper
; AVX2-FCP-LABEL: store_i16_stride2_vf32:
; AVX2-FCP: # %bb.0:
658 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
659 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
660 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm2
661 ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm3
662 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
663 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
664 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3]
665 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1]
666 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
667 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
668 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3]
669 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1]
670 ; AVX2-FCP-NEXT: vmovdqa %ymm1, 64(%rdx)
671 ; AVX2-FCP-NEXT: vmovdqa %ymm3, 96(%rdx)
672 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rdx)
673 ; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rdx)
674 ; AVX2-FCP-NEXT: vzeroupper
675 ; AVX2-FCP-NEXT: retq
; AVX512-LABEL: store_i16_stride2_vf32:
; AVX512: # %bb.0:
679 ; AVX512-NEXT: vmovdqa (%rsi), %xmm0
680 ; AVX512-NEXT: vmovdqa 16(%rsi), %xmm1
681 ; AVX512-NEXT: vmovdqa 32(%rsi), %xmm2
682 ; AVX512-NEXT: vmovdqa 48(%rsi), %xmm3
683 ; AVX512-NEXT: vmovdqa (%rdi), %xmm4
684 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm5
685 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm6
686 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm7
687 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
688 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
689 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
690 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
691 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
692 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
693 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
694 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
695 ; AVX512-NEXT: vmovdqa %xmm3, 96(%rdx)
696 ; AVX512-NEXT: vmovdqa %xmm6, 112(%rdx)
697 ; AVX512-NEXT: vmovdqa %xmm2, 64(%rdx)
698 ; AVX512-NEXT: vmovdqa %xmm5, 80(%rdx)
699 ; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx)
700 ; AVX512-NEXT: vmovdqa %xmm4, 48(%rdx)
701 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
702 ; AVX512-NEXT: vmovdqa %xmm8, 16(%rdx)
705 ; AVX512-FCP-LABEL: store_i16_stride2_vf32:
706 ; AVX512-FCP: # %bb.0:
707 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm0
708 ; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
709 ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm2
710 ; AVX512-FCP-NEXT: vmovdqa 48(%rsi), %xmm3
711 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4
712 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
713 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
714 ; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm7
715 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
716 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
717 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
718 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
719 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
720 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
721 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
722 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
723 ; AVX512-FCP-NEXT: vmovdqa %xmm3, 96(%rdx)
724 ; AVX512-FCP-NEXT: vmovdqa %xmm6, 112(%rdx)
725 ; AVX512-FCP-NEXT: vmovdqa %xmm2, 64(%rdx)
726 ; AVX512-FCP-NEXT: vmovdqa %xmm5, 80(%rdx)
727 ; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
728 ; AVX512-FCP-NEXT: vmovdqa %xmm4, 48(%rdx)
729 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx)
730 ; AVX512-FCP-NEXT: vmovdqa %xmm8, 16(%rdx)
731 ; AVX512-FCP-NEXT: retq
; AVX512DQ-LABEL: store_i16_stride2_vf32:
; AVX512DQ: # %bb.0:
735 ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0
736 ; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm1
737 ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm2
738 ; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm3
739 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm4
740 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm5
741 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm6
742 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm7
743 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
744 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
745 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
746 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
747 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
748 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
749 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
750 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
751 ; AVX512DQ-NEXT: vmovdqa %xmm3, 96(%rdx)
752 ; AVX512DQ-NEXT: vmovdqa %xmm6, 112(%rdx)
753 ; AVX512DQ-NEXT: vmovdqa %xmm2, 64(%rdx)
754 ; AVX512DQ-NEXT: vmovdqa %xmm5, 80(%rdx)
755 ; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx)
756 ; AVX512DQ-NEXT: vmovdqa %xmm4, 48(%rdx)
757 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
758 ; AVX512DQ-NEXT: vmovdqa %xmm8, 16(%rdx)
759 ; AVX512DQ-NEXT: retq
761 ; AVX512DQ-FCP-LABEL: store_i16_stride2_vf32:
762 ; AVX512DQ-FCP: # %bb.0:
763 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm0
764 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
765 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm2
766 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rsi), %xmm3
767 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4
768 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
769 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
770 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm7
771 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
772 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
773 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
774 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
775 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
776 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
777 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
778 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
779 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 96(%rdx)
780 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, 112(%rdx)
781 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 64(%rdx)
782 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, 80(%rdx)
783 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
784 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 48(%rdx)
785 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx)
786 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, 16(%rdx)
787 ; AVX512DQ-FCP-NEXT: retq
; AVX512BW-LABEL: store_i16_stride2_vf32:
; AVX512BW: # %bb.0:
791 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
792 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1
793 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
794 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
795 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63]
796 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
797 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rdx)
798 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rdx)
799 ; AVX512BW-NEXT: vzeroupper
800 ; AVX512BW-NEXT: retq
802 ; AVX512BW-FCP-LABEL: store_i16_stride2_vf32:
803 ; AVX512BW-FCP: # %bb.0:
804 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
805 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1
806 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
807 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
808 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63]
809 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
810 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx)
811 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx)
812 ; AVX512BW-FCP-NEXT: vzeroupper
813 ; AVX512BW-FCP-NEXT: retq
815 ; AVX512DQ-BW-LABEL: store_i16_stride2_vf32:
816 ; AVX512DQ-BW: # %bb.0:
817 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
818 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1
819 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
820 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
821 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63]
822 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
823 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rdx)
824 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rdx)
825 ; AVX512DQ-BW-NEXT: vzeroupper
826 ; AVX512DQ-BW-NEXT: retq
828 ; AVX512DQ-BW-FCP-LABEL: store_i16_stride2_vf32:
829 ; AVX512DQ-BW-FCP: # %bb.0:
830 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
831 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1
832 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
833 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
834 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63]
835 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
836 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx)
837 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx)
838 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
839 ; AVX512DQ-BW-FCP-NEXT: retq
840 %in.vec0 = load <32 x i16>, ptr %in.vecptr0, align 64
841 %in.vec1 = load <32 x i16>, ptr %in.vecptr1, align 64
842 %1 = shufflevector <32 x i16> %in.vec0, <32 x i16> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
843 %interleaved.vec = shufflevector <64 x i16> %1, <64 x i16> poison, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
store <64 x i16> %interleaved.vec, ptr %out.vec, align 64
ret void
}

848 define void @store_i16_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
; SSE-LABEL: store_i16_stride2_vf64:
; SSE: # %bb.0:
851 ; SSE-NEXT: movdqa 112(%rdi), %xmm0
852 ; SSE-NEXT: movdqa 96(%rdi), %xmm6
853 ; SSE-NEXT: movdqa 80(%rdi), %xmm4
854 ; SSE-NEXT: movdqa 64(%rdi), %xmm3
855 ; SSE-NEXT: movdqa (%rdi), %xmm8
856 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
857 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
858 ; SSE-NEXT: movdqa 48(%rdi), %xmm5
859 ; SSE-NEXT: movdqa 96(%rsi), %xmm11
860 ; SSE-NEXT: movdqa 80(%rsi), %xmm12
861 ; SSE-NEXT: movdqa 64(%rsi), %xmm13
862 ; SSE-NEXT: movdqa (%rsi), %xmm9
863 ; SSE-NEXT: movdqa 16(%rsi), %xmm10
864 ; SSE-NEXT: movdqa 32(%rsi), %xmm14
865 ; SSE-NEXT: movdqa 48(%rsi), %xmm15
866 ; SSE-NEXT: movdqa %xmm8, %xmm7
867 ; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
868 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
869 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
870 ; SSE-NEXT: movdqa %xmm1, %xmm9
871 ; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
872 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3]
873 ; SSE-NEXT: movdqa %xmm2, %xmm10
874 ; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
875 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
876 ; SSE-NEXT: movdqa %xmm5, %xmm14
877 ; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
878 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3]
879 ; SSE-NEXT: movdqa %xmm3, %xmm15
880 ; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7]
881 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
882 ; SSE-NEXT: movdqa %xmm4, %xmm13
883 ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
884 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
885 ; SSE-NEXT: movdqa %xmm6, %xmm12
886 ; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
887 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3]
888 ; SSE-NEXT: movdqa 112(%rsi), %xmm11
889 ; SSE-NEXT: movdqa %xmm0, %xmm7
890 ; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
891 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
892 ; SSE-NEXT: movdqa %xmm0, 224(%rdx)
893 ; SSE-NEXT: movdqa %xmm7, 240(%rdx)
894 ; SSE-NEXT: movdqa %xmm6, 192(%rdx)
895 ; SSE-NEXT: movdqa %xmm12, 208(%rdx)
896 ; SSE-NEXT: movdqa %xmm4, 160(%rdx)
897 ; SSE-NEXT: movdqa %xmm13, 176(%rdx)
898 ; SSE-NEXT: movdqa %xmm3, 128(%rdx)
899 ; SSE-NEXT: movdqa %xmm15, 144(%rdx)
900 ; SSE-NEXT: movdqa %xmm5, 96(%rdx)
901 ; SSE-NEXT: movdqa %xmm14, 112(%rdx)
902 ; SSE-NEXT: movdqa %xmm2, 64(%rdx)
903 ; SSE-NEXT: movdqa %xmm10, 80(%rdx)
904 ; SSE-NEXT: movdqa %xmm1, 32(%rdx)
905 ; SSE-NEXT: movdqa %xmm9, 48(%rdx)
906 ; SSE-NEXT: movdqa %xmm8, (%rdx)
907 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
908 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
; AVX-LABEL: store_i16_stride2_vf64:
; AVX: # %bb.0:
913 ; AVX-NEXT: vmovdqa 64(%rsi), %xmm1
914 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm2
915 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
916 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
917 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
918 ; AVX-NEXT: vmovdqa 80(%rsi), %xmm3
919 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm4
920 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
921 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
922 ; AVX-NEXT: vmovdqa (%rsi), %xmm4
923 ; AVX-NEXT: vmovdqa 16(%rsi), %xmm5
924 ; AVX-NEXT: vmovdqa 32(%rsi), %xmm6
925 ; AVX-NEXT: vmovdqa 48(%rsi), %xmm7
926 ; AVX-NEXT: vmovdqa (%rdi), %xmm8
927 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm9
928 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm10
929 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm11
930 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
931 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
932 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
933 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
934 ; AVX-NEXT: vmovdqa 96(%rsi), %xmm10
935 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm13
936 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
937 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
938 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
939 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3]
940 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7]
941 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
942 ; AVX-NEXT: vmovdqa 112(%rsi), %xmm9
943 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm15
944 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7]
945 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
946 ; AVX-NEXT: vmovdqa %xmm9, 224(%rdx)
947 ; AVX-NEXT: vmovdqa %xmm0, 240(%rdx)
948 ; AVX-NEXT: vmovdqa %xmm5, 32(%rdx)
949 ; AVX-NEXT: vmovdqa %xmm11, 48(%rdx)
950 ; AVX-NEXT: vmovdqa %xmm7, 96(%rdx)
951 ; AVX-NEXT: vmovdqa %xmm13, 112(%rdx)
952 ; AVX-NEXT: vmovdqa %xmm10, 192(%rdx)
953 ; AVX-NEXT: vmovdqa %xmm14, 208(%rdx)
954 ; AVX-NEXT: vmovdqa %xmm6, 64(%rdx)
955 ; AVX-NEXT: vmovdqa %xmm8, 80(%rdx)
956 ; AVX-NEXT: vmovdqa %xmm4, (%rdx)
957 ; AVX-NEXT: vmovdqa %xmm12, 16(%rdx)
958 ; AVX-NEXT: vmovdqa %xmm3, 160(%rdx)
959 ; AVX-NEXT: vmovdqa %xmm2, 176(%rdx)
960 ; AVX-NEXT: vmovdqa %xmm1, 128(%rdx)
961 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
962 ; AVX-NEXT: vmovaps %xmm0, 144(%rdx)
; AVX2-LABEL: store_i16_stride2_vf64:
; AVX2: # %bb.0:
967 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
968 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
969 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm2
970 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3
971 ; AVX2-NEXT: vmovdqa (%rsi), %ymm4
972 ; AVX2-NEXT: vmovdqa 32(%rsi), %ymm5
973 ; AVX2-NEXT: vmovdqa 64(%rsi), %ymm6
974 ; AVX2-NEXT: vmovdqa 96(%rsi), %ymm7
975 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15]
976 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11]
977 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[2,3],ymm8[2,3]
978 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm8[0,1]
979 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15]
980 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11]
981 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],ymm8[2,3]
982 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1]
983 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[12],ymm6[12],ymm2[13],ymm6[13],ymm2[14],ymm6[14],ymm2[15],ymm6[15]
984 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11]
985 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm8[2,3]
986 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[0,1],ymm8[0,1]
987 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm3[4],ymm7[4],ymm3[5],ymm7[5],ymm3[6],ymm7[6],ymm3[7],ymm7[7],ymm3[12],ymm7[12],ymm3[13],ymm7[13],ymm3[14],ymm7[14],ymm3[15],ymm7[15]
988 ; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[8],ymm7[8],ymm3[9],ymm7[9],ymm3[10],ymm7[10],ymm3[11],ymm7[11]
989 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm3[2,3],ymm8[2,3]
990 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm8[0,1]
991 ; AVX2-NEXT: vmovdqa %ymm3, 192(%rdx)
992 ; AVX2-NEXT: vmovdqa %ymm7, 224(%rdx)
993 ; AVX2-NEXT: vmovdqa %ymm2, 128(%rdx)
994 ; AVX2-NEXT: vmovdqa %ymm6, 160(%rdx)
995 ; AVX2-NEXT: vmovdqa %ymm1, 64(%rdx)
996 ; AVX2-NEXT: vmovdqa %ymm5, 96(%rdx)
997 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
998 ; AVX2-NEXT: vmovdqa %ymm4, 32(%rdx)
999 ; AVX2-NEXT: vzeroupper
; AVX2-FP-LABEL: store_i16_stride2_vf64:
; AVX2-FP: # %bb.0:
1004 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
1005 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1
1006 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm2
1007 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm3
1008 ; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm4
1009 ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm5
1010 ; AVX2-FP-NEXT: vmovdqa 64(%rsi), %ymm6
1011 ; AVX2-FP-NEXT: vmovdqa 96(%rsi), %ymm7
1012 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15]
1013 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11]
1014 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[2,3],ymm8[2,3]
1015 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm8[0,1]
1016 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15]
1017 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11]
1018 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],ymm8[2,3]
1019 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1]
1020 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[12],ymm6[12],ymm2[13],ymm6[13],ymm2[14],ymm6[14],ymm2[15],ymm6[15]
1021 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11]
1022 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm8[2,3]
1023 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[0,1],ymm8[0,1]
1024 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm3[4],ymm7[4],ymm3[5],ymm7[5],ymm3[6],ymm7[6],ymm3[7],ymm7[7],ymm3[12],ymm7[12],ymm3[13],ymm7[13],ymm3[14],ymm7[14],ymm3[15],ymm7[15]
1025 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[8],ymm7[8],ymm3[9],ymm7[9],ymm3[10],ymm7[10],ymm3[11],ymm7[11]
1026 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm3[2,3],ymm8[2,3]
1027 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm8[0,1]
1028 ; AVX2-FP-NEXT: vmovdqa %ymm3, 192(%rdx)
1029 ; AVX2-FP-NEXT: vmovdqa %ymm7, 224(%rdx)
1030 ; AVX2-FP-NEXT: vmovdqa %ymm2, 128(%rdx)
1031 ; AVX2-FP-NEXT: vmovdqa %ymm6, 160(%rdx)
1032 ; AVX2-FP-NEXT: vmovdqa %ymm1, 64(%rdx)
1033 ; AVX2-FP-NEXT: vmovdqa %ymm5, 96(%rdx)
1034 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rdx)
1035 ; AVX2-FP-NEXT: vmovdqa %ymm4, 32(%rdx)
1036 ; AVX2-FP-NEXT: vzeroupper
1037 ; AVX2-FP-NEXT: retq
; AVX2-FCP-LABEL: store_i16_stride2_vf64:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm3
; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm4
; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm5
; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %ymm6
; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %ymm7
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[2,3],ymm8[2,3]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm8[0,1]
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],ymm8[2,3]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1]
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[12],ymm6[12],ymm2[13],ymm6[13],ymm2[14],ymm6[14],ymm2[15],ymm6[15]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm8[2,3]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[0,1],ymm8[0,1]
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm3[4],ymm7[4],ymm3[5],ymm7[5],ymm3[6],ymm7[6],ymm3[7],ymm7[7],ymm3[12],ymm7[12],ymm3[13],ymm7[13],ymm3[14],ymm7[14],ymm3[15],ymm7[15]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[8],ymm7[8],ymm3[9],ymm7[9],ymm3[10],ymm7[10],ymm3[11],ymm7[11]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm3[2,3],ymm8[2,3]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm8[0,1]
; AVX2-FCP-NEXT: vmovdqa %ymm3, 192(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm7, 224(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm2, 128(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm6, 160(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm1, 64(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm5, 96(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm4, 32(%rdx)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i16_stride2_vf64:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa 64(%rsi), %xmm1
; AVX512-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-NEXT: vmovdqa64 %xmm0, %xmm16
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX512-NEXT: vmovdqa 80(%rsi), %xmm3
; AVX512-NEXT: vmovdqa 80(%rdi), %xmm4
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX512-NEXT: vmovdqa 96(%rsi), %xmm5
; AVX512-NEXT: vmovdqa 96(%rdi), %xmm6
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; AVX512-NEXT: vmovdqa 112(%rsi), %xmm6
; AVX512-NEXT: vmovdqa 112(%rdi), %xmm7
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; AVX512-NEXT: vmovdqa (%rsi), %xmm7
; AVX512-NEXT: vmovdqa 16(%rsi), %xmm9
; AVX512-NEXT: vmovdqa 32(%rsi), %xmm10
; AVX512-NEXT: vmovdqa 48(%rsi), %xmm11
; AVX512-NEXT: vmovdqa (%rdi), %xmm12
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm13
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm14
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm12
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
; AVX512-NEXT: vmovdqa %xmm9, 48(%rdx)
; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX512-NEXT: vmovdqa %xmm7, 16(%rdx)
; AVX512-NEXT: vmovdqa %xmm14, (%rdx)
; AVX512-NEXT: vmovdqa %xmm11, 112(%rdx)
; AVX512-NEXT: vmovdqa %xmm13, 96(%rdx)
; AVX512-NEXT: vmovdqa %xmm10, 80(%rdx)
; AVX512-NEXT: vmovdqa %xmm15, 64(%rdx)
; AVX512-NEXT: vmovdqa %xmm6, 240(%rdx)
; AVX512-NEXT: vmovdqa %xmm8, 224(%rdx)
; AVX512-NEXT: vmovdqa %xmm5, 208(%rdx)
; AVX512-NEXT: vmovdqa %xmm4, 192(%rdx)
; AVX512-NEXT: vmovdqa %xmm3, 176(%rdx)
; AVX512-NEXT: vmovdqa %xmm2, 160(%rdx)
; AVX512-NEXT: vmovdqa %xmm1, 144(%rdx)
; AVX512-NEXT: vmovdqa64 %xmm16, 128(%rdx)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i16_stride2_vf64:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm1
; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm16
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX512-FCP-NEXT: vmovdqa 80(%rsi), %xmm3
; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm4
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm5
; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm6
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; AVX512-FCP-NEXT: vmovdqa 112(%rsi), %xmm6
; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm7
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm7
; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm9
; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm10
; AVX512-FCP-NEXT: vmovdqa 48(%rsi), %xmm11
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm12
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm13
; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm14
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm12
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
; AVX512-FCP-NEXT: vmovdqa %xmm9, 48(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm7, 16(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm14, (%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm11, 112(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm13, 96(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm10, 80(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm15, 64(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm6, 240(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm8, 224(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm5, 208(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm4, 192(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm3, 176(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm2, 160(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm1, 144(%rdx)
; AVX512-FCP-NEXT: vmovdqa64 %xmm16, 128(%rdx)
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i16_stride2_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm1
; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm16
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX512DQ-NEXT: vmovdqa 80(%rsi), %xmm3
; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm4
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm5
; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm6
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; AVX512DQ-NEXT: vmovdqa 112(%rsi), %xmm6
; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm7
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm7
; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm9
; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm10
; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm11
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm12
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm13
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm14
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm12
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
; AVX512DQ-NEXT: vmovdqa %xmm9, 48(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm7, 16(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm14, (%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm11, 112(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm13, 96(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm10, 80(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm15, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm6, 240(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm8, 224(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm5, 208(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm4, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm3, 176(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm2, 160(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm1, 144(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %xmm16, 128(%rdx)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i16_stride2_vf64:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm16
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX512DQ-FCP-NEXT: vmovdqa 80(%rsi), %xmm3
; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm4
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm5
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm6
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; AVX512DQ-FCP-NEXT: vmovdqa 112(%rsi), %xmm6
; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm7
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm7
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm9
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm10
; AVX512DQ-FCP-NEXT: vmovdqa 48(%rsi), %xmm11
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm12
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm13
; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm14
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm12
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, 48(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, 16(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, 112(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm13, 96(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm10, 80(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm15, 64(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, 240(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, 224(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, 208(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 192(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 176(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 160(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 144(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, 128(%rdx)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i16_stride2_vf64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2
; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63]
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm4, %zmm5
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm0
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm1, %zmm4
; AVX512BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm1
; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i16_stride2_vf64:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm4, %zmm5
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm6, %zmm0
; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm1, %zmm4
; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i16_stride2_vf64:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm4, %zmm5
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm0
; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm1, %zmm4
; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 128(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i16_stride2_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm4, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm6, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm1, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
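; The IR below loads two <64 x i16> vectors, concatenates them, and interleaves the halves
; element-wise (%in.vec0[0], %in.vec1[0], %in.vec0[1], %in.vec1[1], ...) into one <128 x i16>
; store, i.e. the stride-2 interleaved-store pattern this file exercises.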
  %in.vec0 = load <64 x i16>, ptr %in.vecptr0, align 64
  %in.vec1 = load <64 x i16>, ptr %in.vecptr1, align 64
  %1 = shufflevector <64 x i16> %in.vec0, <64 x i16> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %interleaved.vec = shufflevector <128 x i16> %1, <128 x i16> poison, <128 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
  store <128 x i16> %interleaved.vec, ptr %out.vec, align 64