1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
16 ; These patterns are produced by LoopVectorizer for interleaved stores.
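;
; For illustration only (hypothetical source, not part of this test): a scalar
; loop of the following shape is the kind of stride-2 interleaved store that
; the LoopVectorizer rewrites into the "two loads + shufflevector + one wide
; store" IR checked below.
;
;   void interleave2(char *restrict out, const char *a, const char *b, int n) {
;     for (int i = 0; i < n; i++) {
;       out[2 * i + 0] = a[i]; /* even bytes come from the first input  */
;       out[2 * i + 1] = b[i]; /* odd bytes come from the second input  */
;     }
;   }
;
; vf2: interleave two <2 x i8> inputs into a single <4 x i8> stride-2 store.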
18 define void @store_i8_stride2_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
19 ; SSE-LABEL: store_i8_stride2_vf2:
21 ; SSE-NEXT: movdqa (%rdi), %xmm0
22 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
23 ; SSE-NEXT: movd %xmm0, (%rdx)
26 ; AVX-LABEL: store_i8_stride2_vf2:
28 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
29 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
30 ; AVX-NEXT: vmovd %xmm0, (%rdx)
33 ; AVX2-LABEL: store_i8_stride2_vf2:
35 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
36 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
37 ; AVX2-NEXT: vmovd %xmm0, (%rdx)
40 ; AVX2-FP-LABEL: store_i8_stride2_vf2:
42 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
43 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
44 ; AVX2-FP-NEXT: vmovd %xmm0, (%rdx)
47 ; AVX2-FCP-LABEL: store_i8_stride2_vf2:
49 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
50 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
51 ; AVX2-FCP-NEXT: vmovd %xmm0, (%rdx)
54 ; AVX512-LABEL: store_i8_stride2_vf2:
56 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
57 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
58 ; AVX512-NEXT: vmovd %xmm0, (%rdx)
61 ; AVX512-FCP-LABEL: store_i8_stride2_vf2:
62 ; AVX512-FCP: # %bb.0:
63 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
64 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
65 ; AVX512-FCP-NEXT: vmovd %xmm0, (%rdx)
66 ; AVX512-FCP-NEXT: retq
68 ; AVX512DQ-LABEL: store_i8_stride2_vf2:
70 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
71 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
72 ; AVX512DQ-NEXT: vmovd %xmm0, (%rdx)
75 ; AVX512DQ-FCP-LABEL: store_i8_stride2_vf2:
76 ; AVX512DQ-FCP: # %bb.0:
77 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
78 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
79 ; AVX512DQ-FCP-NEXT: vmovd %xmm0, (%rdx)
80 ; AVX512DQ-FCP-NEXT: retq
82 ; AVX512BW-LABEL: store_i8_stride2_vf2:
84 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
85 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
86 ; AVX512BW-NEXT: vmovd %xmm0, (%rdx)
89 ; AVX512BW-FCP-LABEL: store_i8_stride2_vf2:
90 ; AVX512BW-FCP: # %bb.0:
91 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
92 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
93 ; AVX512BW-FCP-NEXT: vmovd %xmm0, (%rdx)
94 ; AVX512BW-FCP-NEXT: retq
96 ; AVX512DQ-BW-LABEL: store_i8_stride2_vf2:
97 ; AVX512DQ-BW: # %bb.0:
98 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
99 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
100 ; AVX512DQ-BW-NEXT: vmovd %xmm0, (%rdx)
101 ; AVX512DQ-BW-NEXT: retq
103 ; AVX512DQ-BW-FCP-LABEL: store_i8_stride2_vf2:
104 ; AVX512DQ-BW-FCP: # %bb.0:
105 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
106 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
107 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%rdx)
108 ; AVX512DQ-BW-FCP-NEXT: retq
109 %in.vec0 = load <2 x i8>, ptr %in.vecptr0, align 64
110 %in.vec1 = load <2 x i8>, ptr %in.vecptr1, align 64
111 %1 = shufflevector <2 x i8> %in.vec0, <2 x i8> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
112 %interleaved.vec = shufflevector <4 x i8> %1, <4 x i8> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
113 store <4 x i8> %interleaved.vec, ptr %out.vec, align 64
  ret void
}
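
; vf4: interleave two <4 x i8> inputs into a single <8 x i8> stride-2 store.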
117 define void @store_i8_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
118 ; SSE-LABEL: store_i8_stride2_vf4:
120 ; SSE-NEXT: movdqa (%rdi), %xmm0
121 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
122 ; SSE-NEXT: movq %xmm0, (%rdx)
125 ; AVX-LABEL: store_i8_stride2_vf4:
127 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
128 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
129 ; AVX-NEXT: vmovq %xmm0, (%rdx)
132 ; AVX2-LABEL: store_i8_stride2_vf4:
134 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
135 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
136 ; AVX2-NEXT: vmovq %xmm0, (%rdx)
139 ; AVX2-FP-LABEL: store_i8_stride2_vf4:
141 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
142 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
143 ; AVX2-FP-NEXT: vmovq %xmm0, (%rdx)
146 ; AVX2-FCP-LABEL: store_i8_stride2_vf4:
148 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
149 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
150 ; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx)
151 ; AVX2-FCP-NEXT: retq
153 ; AVX512-LABEL: store_i8_stride2_vf4:
155 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
156 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
157 ; AVX512-NEXT: vmovq %xmm0, (%rdx)
160 ; AVX512-FCP-LABEL: store_i8_stride2_vf4:
161 ; AVX512-FCP: # %bb.0:
162 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
163 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
164 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rdx)
165 ; AVX512-FCP-NEXT: retq
167 ; AVX512DQ-LABEL: store_i8_stride2_vf4:
169 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
170 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
171 ; AVX512DQ-NEXT: vmovq %xmm0, (%rdx)
172 ; AVX512DQ-NEXT: retq
174 ; AVX512DQ-FCP-LABEL: store_i8_stride2_vf4:
175 ; AVX512DQ-FCP: # %bb.0:
176 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
177 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
178 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx)
179 ; AVX512DQ-FCP-NEXT: retq
181 ; AVX512BW-LABEL: store_i8_stride2_vf4:
183 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
184 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
185 ; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
186 ; AVX512BW-NEXT: retq
188 ; AVX512BW-FCP-LABEL: store_i8_stride2_vf4:
189 ; AVX512BW-FCP: # %bb.0:
190 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
191 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
192 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
193 ; AVX512BW-FCP-NEXT: retq
195 ; AVX512DQ-BW-LABEL: store_i8_stride2_vf4:
196 ; AVX512DQ-BW: # %bb.0:
197 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
198 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
199 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
200 ; AVX512DQ-BW-NEXT: retq
202 ; AVX512DQ-BW-FCP-LABEL: store_i8_stride2_vf4:
203 ; AVX512DQ-BW-FCP: # %bb.0:
204 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
205 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
206 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
207 ; AVX512DQ-BW-FCP-NEXT: retq
208 %in.vec0 = load <4 x i8>, ptr %in.vecptr0, align 64
209 %in.vec1 = load <4 x i8>, ptr %in.vecptr1, align 64
210 %1 = shufflevector <4 x i8> %in.vec0, <4 x i8> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
211 %interleaved.vec = shufflevector <8 x i8> %1, <8 x i8> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
212 store <8 x i8> %interleaved.vec, ptr %out.vec, align 64
  ret void
}
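
; vf8: interleave two <8 x i8> inputs into a single <16 x i8> stride-2 store.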
216 define void @store_i8_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
217 ; SSE-LABEL: store_i8_stride2_vf8:
219 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
220 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
221 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
222 ; SSE-NEXT: movdqa %xmm1, (%rdx)
225 ; AVX-LABEL: store_i8_stride2_vf8:
227 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
228 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
229 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
230 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
233 ; AVX2-LABEL: store_i8_stride2_vf8:
235 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
236 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
237 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
238 ; AVX2-NEXT: vmovdqa %xmm0, (%rdx)
241 ; AVX2-FP-LABEL: store_i8_stride2_vf8:
243 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
244 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
245 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
246 ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rdx)
249 ; AVX2-FCP-LABEL: store_i8_stride2_vf8:
251 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
252 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
253 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
254 ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rdx)
255 ; AVX2-FCP-NEXT: retq
257 ; AVX512-LABEL: store_i8_stride2_vf8:
259 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
260 ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
261 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
262 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
265 ; AVX512-FCP-LABEL: store_i8_stride2_vf8:
266 ; AVX512-FCP: # %bb.0:
267 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
268 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
269 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
270 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx)
271 ; AVX512-FCP-NEXT: retq
273 ; AVX512DQ-LABEL: store_i8_stride2_vf8:
275 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
276 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
277 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
278 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
279 ; AVX512DQ-NEXT: retq
281 ; AVX512DQ-FCP-LABEL: store_i8_stride2_vf8:
282 ; AVX512DQ-FCP: # %bb.0:
283 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
284 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
285 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
286 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx)
287 ; AVX512DQ-FCP-NEXT: retq
289 ; AVX512BW-LABEL: store_i8_stride2_vf8:
291 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
292 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
293 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
294 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rdx)
295 ; AVX512BW-NEXT: retq
297 ; AVX512BW-FCP-LABEL: store_i8_stride2_vf8:
298 ; AVX512BW-FCP: # %bb.0:
299 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
300 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
301 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
302 ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rdx)
303 ; AVX512BW-FCP-NEXT: retq
305 ; AVX512DQ-BW-LABEL: store_i8_stride2_vf8:
306 ; AVX512DQ-BW: # %bb.0:
307 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
308 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
309 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
310 ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rdx)
311 ; AVX512DQ-BW-NEXT: retq
313 ; AVX512DQ-BW-FCP-LABEL: store_i8_stride2_vf8:
314 ; AVX512DQ-BW-FCP: # %bb.0:
315 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
316 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
317 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
318 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rdx)
319 ; AVX512DQ-BW-FCP-NEXT: retq
320 %in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64
321 %in.vec1 = load <8 x i8>, ptr %in.vecptr1, align 64
322 %1 = shufflevector <8 x i8> %in.vec0, <8 x i8> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
323 %interleaved.vec = shufflevector <16 x i8> %1, <16 x i8> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
324 store <16 x i8> %interleaved.vec, ptr %out.vec, align 64
  ret void
}
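
; vf16: interleave two <16 x i8> inputs into a single <32 x i8> stride-2 store.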
328 define void @store_i8_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
329 ; SSE-LABEL: store_i8_stride2_vf16:
331 ; SSE-NEXT: movdqa (%rdi), %xmm0
332 ; SSE-NEXT: movdqa (%rsi), %xmm1
333 ; SSE-NEXT: movdqa %xmm0, %xmm2
334 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
335 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
336 ; SSE-NEXT: movdqa %xmm0, 16(%rdx)
337 ; SSE-NEXT: movdqa %xmm2, (%rdx)
340 ; AVX-LABEL: store_i8_stride2_vf16:
342 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
343 ; AVX-NEXT: vmovdqa (%rsi), %xmm1
344 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
345 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
346 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx)
347 ; AVX-NEXT: vmovdqa %xmm2, (%rdx)
350 ; AVX2-LABEL: store_i8_stride2_vf16:
352 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
353 ; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
354 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
355 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
356 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
357 ; AVX2-NEXT: vzeroupper
360 ; AVX2-FP-LABEL: store_i8_stride2_vf16:
362 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
363 ; AVX2-FP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
364 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
365 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
366 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rdx)
367 ; AVX2-FP-NEXT: vzeroupper
370 ; AVX2-FCP-LABEL: store_i8_stride2_vf16:
372 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
373 ; AVX2-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
374 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
375 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
376 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rdx)
377 ; AVX2-FCP-NEXT: vzeroupper
378 ; AVX2-FCP-NEXT: retq
380 ; AVX512-LABEL: store_i8_stride2_vf16:
382 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
383 ; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
384 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
385 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
386 ; AVX512-NEXT: vmovdqa %ymm0, (%rdx)
387 ; AVX512-NEXT: vzeroupper
390 ; AVX512-FCP-LABEL: store_i8_stride2_vf16:
391 ; AVX512-FCP: # %bb.0:
392 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
393 ; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
394 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
395 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
396 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rdx)
397 ; AVX512-FCP-NEXT: vzeroupper
398 ; AVX512-FCP-NEXT: retq
400 ; AVX512DQ-LABEL: store_i8_stride2_vf16:
402 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
403 ; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
404 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
405 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
406 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx)
407 ; AVX512DQ-NEXT: vzeroupper
408 ; AVX512DQ-NEXT: retq
410 ; AVX512DQ-FCP-LABEL: store_i8_stride2_vf16:
411 ; AVX512DQ-FCP: # %bb.0:
412 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
413 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
414 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
415 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
416 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rdx)
417 ; AVX512DQ-FCP-NEXT: vzeroupper
418 ; AVX512DQ-FCP-NEXT: retq
420 ; AVX512BW-LABEL: store_i8_stride2_vf16:
422 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
423 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
424 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
425 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
426 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx)
427 ; AVX512BW-NEXT: vzeroupper
428 ; AVX512BW-NEXT: retq
430 ; AVX512BW-FCP-LABEL: store_i8_stride2_vf16:
431 ; AVX512BW-FCP: # %bb.0:
432 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
433 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
434 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
435 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
436 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rdx)
437 ; AVX512BW-FCP-NEXT: vzeroupper
438 ; AVX512BW-FCP-NEXT: retq
440 ; AVX512DQ-BW-LABEL: store_i8_stride2_vf16:
441 ; AVX512DQ-BW: # %bb.0:
442 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
443 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
444 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
445 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
446 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rdx)
447 ; AVX512DQ-BW-NEXT: vzeroupper
448 ; AVX512DQ-BW-NEXT: retq
450 ; AVX512DQ-BW-FCP-LABEL: store_i8_stride2_vf16:
451 ; AVX512DQ-BW-FCP: # %bb.0:
452 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
453 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
454 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
455 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31]
456 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rdx)
457 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
458 ; AVX512DQ-BW-FCP-NEXT: retq
459 %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64
460 %in.vec1 = load <16 x i8>, ptr %in.vecptr1, align 64
461 %1 = shufflevector <16 x i8> %in.vec0, <16 x i8> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
462 %interleaved.vec = shufflevector <32 x i8> %1, <32 x i8> poison, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
463 store <32 x i8> %interleaved.vec, ptr %out.vec, align 64
  ret void
}
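
; vf32: interleave two <32 x i8> inputs into a single <64 x i8> stride-2 store.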
467 define void @store_i8_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
468 ; SSE-LABEL: store_i8_stride2_vf32:
470 ; SSE-NEXT: movdqa (%rdi), %xmm0
471 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
472 ; SSE-NEXT: movdqa (%rsi), %xmm2
473 ; SSE-NEXT: movdqa 16(%rsi), %xmm3
474 ; SSE-NEXT: movdqa %xmm0, %xmm4
475 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
476 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
477 ; SSE-NEXT: movdqa %xmm1, %xmm2
478 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
479 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
480 ; SSE-NEXT: movdqa %xmm1, 32(%rdx)
481 ; SSE-NEXT: movdqa %xmm2, 48(%rdx)
482 ; SSE-NEXT: movdqa %xmm0, (%rdx)
483 ; SSE-NEXT: movdqa %xmm4, 16(%rdx)
486 ; AVX-LABEL: store_i8_stride2_vf32:
488 ; AVX-NEXT: vmovdqa (%rsi), %xmm0
489 ; AVX-NEXT: vmovdqa 16(%rsi), %xmm1
490 ; AVX-NEXT: vmovdqa (%rdi), %xmm2
491 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
492 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
493 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
494 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
495 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
496 ; AVX-NEXT: vmovdqa %xmm1, 48(%rdx)
497 ; AVX-NEXT: vmovdqa %xmm2, 32(%rdx)
498 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
499 ; AVX-NEXT: vmovdqa %xmm4, 16(%rdx)
502 ; AVX2-LABEL: store_i8_stride2_vf32:
504 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
505 ; AVX2-NEXT: vmovdqa (%rsi), %ymm1
506 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
507 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
508 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[0,1],ymm2[0,1]
509 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
510 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
511 ; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
512 ; AVX2-NEXT: vzeroupper
515 ; AVX2-FP-LABEL: store_i8_stride2_vf32:
517 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
518 ; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm1
519 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
520 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
521 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[0,1],ymm2[0,1]
522 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
523 ; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rdx)
524 ; AVX2-FP-NEXT: vmovdqa %ymm1, (%rdx)
525 ; AVX2-FP-NEXT: vzeroupper
528 ; AVX2-FCP-LABEL: store_i8_stride2_vf32:
530 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
531 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm1
532 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
533 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
534 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[0,1],ymm2[0,1]
535 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
536 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rdx)
537 ; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rdx)
538 ; AVX2-FCP-NEXT: vzeroupper
539 ; AVX2-FCP-NEXT: retq
541 ; AVX512-LABEL: store_i8_stride2_vf32:
543 ; AVX512-NEXT: vmovdqa (%rsi), %xmm0
544 ; AVX512-NEXT: vmovdqa 16(%rsi), %xmm1
545 ; AVX512-NEXT: vmovdqa (%rdi), %xmm2
546 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3
547 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
548 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
549 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
550 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
551 ; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx)
552 ; AVX512-NEXT: vmovdqa %xmm2, 48(%rdx)
553 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
554 ; AVX512-NEXT: vmovdqa %xmm4, 16(%rdx)
557 ; AVX512-FCP-LABEL: store_i8_stride2_vf32:
558 ; AVX512-FCP: # %bb.0:
559 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm0
560 ; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
561 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2
562 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
563 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
564 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
565 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
566 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
567 ; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
568 ; AVX512-FCP-NEXT: vmovdqa %xmm2, 48(%rdx)
569 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx)
570 ; AVX512-FCP-NEXT: vmovdqa %xmm4, 16(%rdx)
571 ; AVX512-FCP-NEXT: retq
573 ; AVX512DQ-LABEL: store_i8_stride2_vf32:
575 ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0
576 ; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm1
577 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2
578 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3
579 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
580 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
581 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
582 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
583 ; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx)
584 ; AVX512DQ-NEXT: vmovdqa %xmm2, 48(%rdx)
585 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
586 ; AVX512DQ-NEXT: vmovdqa %xmm4, 16(%rdx)
587 ; AVX512DQ-NEXT: retq
589 ; AVX512DQ-FCP-LABEL: store_i8_stride2_vf32:
590 ; AVX512DQ-FCP: # %bb.0:
591 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm0
592 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
593 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2
594 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
595 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
596 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
597 ; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
598 ; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
599 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
600 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 48(%rdx)
601 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx)
602 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 16(%rdx)
603 ; AVX512DQ-FCP-NEXT: retq
605 ; AVX512BW-LABEL: store_i8_stride2_vf32:
607 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
608 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
609 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
610 ; AVX512BW-NEXT: vpermq %zmm0, %zmm1, %zmm0
611 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31,32,40,33,41,34,42,35,43,36,44,37,45,38,46,39,47,48,56,49,57,50,58,51,59,52,60,53,61,54,62,55,63]
612 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
613 ; AVX512BW-NEXT: vzeroupper
614 ; AVX512BW-NEXT: retq
616 ; AVX512BW-FCP-LABEL: store_i8_stride2_vf32:
617 ; AVX512BW-FCP: # %bb.0:
618 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
619 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
620 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
621 ; AVX512BW-FCP-NEXT: vpermq %zmm0, %zmm1, %zmm0
622 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31,32,40,33,41,34,42,35,43,36,44,37,45,38,46,39,47,48,56,49,57,50,58,51,59,52,60,53,61,54,62,55,63]
623 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
624 ; AVX512BW-FCP-NEXT: vzeroupper
625 ; AVX512BW-FCP-NEXT: retq
627 ; AVX512DQ-BW-LABEL: store_i8_stride2_vf32:
628 ; AVX512DQ-BW: # %bb.0:
629 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0
630 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
631 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
632 ; AVX512DQ-BW-NEXT: vpermq %zmm0, %zmm1, %zmm0
633 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31,32,40,33,41,34,42,35,43,36,44,37,45,38,46,39,47,48,56,49,57,50,58,51,59,52,60,53,61,54,62,55,63]
634 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx)
635 ; AVX512DQ-BW-NEXT: vzeroupper
636 ; AVX512DQ-BW-NEXT: retq
638 ; AVX512DQ-BW-FCP-LABEL: store_i8_stride2_vf32:
639 ; AVX512DQ-BW-FCP: # %bb.0:
640 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
641 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
642 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,4,1,5,2,6,3,7]
643 ; AVX512DQ-BW-FCP-NEXT: vpermq %zmm0, %zmm1, %zmm0
644 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15,16,24,17,25,18,26,19,27,20,28,21,29,22,30,23,31,32,40,33,41,34,42,35,43,36,44,37,45,38,46,39,47,48,56,49,57,50,58,51,59,52,60,53,61,54,62,55,63]
645 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
646 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
647 ; AVX512DQ-BW-FCP-NEXT: retq
648 %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64
649 %in.vec1 = load <32 x i8>, ptr %in.vecptr1, align 64
650 %1 = shufflevector <32 x i8> %in.vec0, <32 x i8> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
651 %interleaved.vec = shufflevector <64 x i8> %1, <64 x i8> poison, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
652 store <64 x i8> %interleaved.vec, ptr %out.vec, align 64
  ret void
}
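
; vf64: interleave two <64 x i8> inputs into a single <128 x i8> stride-2 store.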
656 define void @store_i8_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind {
657 ; SSE-LABEL: store_i8_stride2_vf64:
659 ; SSE-NEXT: movdqa (%rdi), %xmm0
660 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
661 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
662 ; SSE-NEXT: movdqa 48(%rdi), %xmm3
663 ; SSE-NEXT: movdqa (%rsi), %xmm4
664 ; SSE-NEXT: movdqa 16(%rsi), %xmm5
665 ; SSE-NEXT: movdqa 32(%rsi), %xmm6
666 ; SSE-NEXT: movdqa 48(%rsi), %xmm7
667 ; SSE-NEXT: movdqa %xmm0, %xmm8
668 ; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
669 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
670 ; SSE-NEXT: movdqa %xmm1, %xmm4
671 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
672 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
673 ; SSE-NEXT: movdqa %xmm2, %xmm5
674 ; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
675 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
676 ; SSE-NEXT: movdqa %xmm3, %xmm6
677 ; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
678 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
679 ; SSE-NEXT: movdqa %xmm3, 96(%rdx)
680 ; SSE-NEXT: movdqa %xmm6, 112(%rdx)
681 ; SSE-NEXT: movdqa %xmm2, 64(%rdx)
682 ; SSE-NEXT: movdqa %xmm5, 80(%rdx)
683 ; SSE-NEXT: movdqa %xmm1, 32(%rdx)
684 ; SSE-NEXT: movdqa %xmm4, 48(%rdx)
685 ; SSE-NEXT: movdqa %xmm0, (%rdx)
686 ; SSE-NEXT: movdqa %xmm8, 16(%rdx)
689 ; AVX-LABEL: store_i8_stride2_vf64:
691 ; AVX-NEXT: vmovdqa (%rsi), %xmm0
692 ; AVX-NEXT: vmovdqa 16(%rsi), %xmm1
693 ; AVX-NEXT: vmovdqa 32(%rsi), %xmm2
694 ; AVX-NEXT: vmovdqa 48(%rsi), %xmm3
695 ; AVX-NEXT: vmovdqa (%rdi), %xmm4
696 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm5
697 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm6
698 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm7
699 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
700 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
701 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
702 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
703 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
704 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
705 ; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
706 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
707 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
708 ; AVX-NEXT: vmovdqa %xmm5, 16(%rdx)
709 ; AVX-NEXT: vmovdqa %xmm1, 32(%rdx)
710 ; AVX-NEXT: vmovdqa %xmm7, 48(%rdx)
711 ; AVX-NEXT: vmovdqa %xmm3, 96(%rdx)
712 ; AVX-NEXT: vmovdqa %xmm6, 112(%rdx)
713 ; AVX-NEXT: vmovdqa %xmm2, 64(%rdx)
714 ; AVX-NEXT: vmovdqa %xmm8, 80(%rdx)
717 ; AVX2-LABEL: store_i8_stride2_vf64:
719 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
720 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
721 ; AVX2-NEXT: vmovdqa (%rsi), %ymm2
722 ; AVX2-NEXT: vmovdqa 32(%rsi), %ymm3
723 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
724 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
725 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3]
726 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1]
727 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
728 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
729 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3]
730 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1]
731 ; AVX2-NEXT: vmovdqa %ymm1, 64(%rdx)
732 ; AVX2-NEXT: vmovdqa %ymm3, 96(%rdx)
733 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
734 ; AVX2-NEXT: vmovdqa %ymm2, 32(%rdx)
735 ; AVX2-NEXT: vzeroupper
738 ; AVX2-FP-LABEL: store_i8_stride2_vf64:
740 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
741 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1
742 ; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm2
743 ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm3
744 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
745 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
746 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3]
747 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1]
748 ; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
749 ; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
750 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3]
751 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1]
752 ; AVX2-FP-NEXT: vmovdqa %ymm1, 64(%rdx)
753 ; AVX2-FP-NEXT: vmovdqa %ymm3, 96(%rdx)
754 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rdx)
755 ; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%rdx)
756 ; AVX2-FP-NEXT: vzeroupper
759 ; AVX2-FCP-LABEL: store_i8_stride2_vf64:
761 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
762 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
763 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm2
764 ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm3
765 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
766 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
767 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3]
768 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1]
769 ; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
770 ; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
771 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3]
772 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1]
773 ; AVX2-FCP-NEXT: vmovdqa %ymm1, 64(%rdx)
774 ; AVX2-FCP-NEXT: vmovdqa %ymm3, 96(%rdx)
775 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rdx)
776 ; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rdx)
777 ; AVX2-FCP-NEXT: vzeroupper
778 ; AVX2-FCP-NEXT: retq
780 ; AVX512-LABEL: store_i8_stride2_vf64:
782 ; AVX512-NEXT: vmovdqa (%rsi), %xmm0
783 ; AVX512-NEXT: vmovdqa 16(%rsi), %xmm1
784 ; AVX512-NEXT: vmovdqa 32(%rsi), %xmm2
785 ; AVX512-NEXT: vmovdqa 48(%rsi), %xmm3
786 ; AVX512-NEXT: vmovdqa (%rdi), %xmm4
787 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm5
788 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm6
789 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm7
790 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
791 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
792 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
793 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
794 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
795 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
796 ; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
797 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
798 ; AVX512-NEXT: vmovdqa %xmm3, 96(%rdx)
799 ; AVX512-NEXT: vmovdqa %xmm6, 112(%rdx)
800 ; AVX512-NEXT: vmovdqa %xmm2, 64(%rdx)
801 ; AVX512-NEXT: vmovdqa %xmm5, 80(%rdx)
802 ; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx)
803 ; AVX512-NEXT: vmovdqa %xmm4, 48(%rdx)
804 ; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
805 ; AVX512-NEXT: vmovdqa %xmm8, 16(%rdx)
808 ; AVX512-FCP-LABEL: store_i8_stride2_vf64:
809 ; AVX512-FCP: # %bb.0:
810 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm0
811 ; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
812 ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm2
813 ; AVX512-FCP-NEXT: vmovdqa 48(%rsi), %xmm3
814 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4
815 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
816 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
817 ; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm7
818 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
819 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
820 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
821 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
822 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
823 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
824 ; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
825 ; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
826 ; AVX512-FCP-NEXT: vmovdqa %xmm3, 96(%rdx)
827 ; AVX512-FCP-NEXT: vmovdqa %xmm6, 112(%rdx)
828 ; AVX512-FCP-NEXT: vmovdqa %xmm2, 64(%rdx)
829 ; AVX512-FCP-NEXT: vmovdqa %xmm5, 80(%rdx)
830 ; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
831 ; AVX512-FCP-NEXT: vmovdqa %xmm4, 48(%rdx)
832 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx)
833 ; AVX512-FCP-NEXT: vmovdqa %xmm8, 16(%rdx)
834 ; AVX512-FCP-NEXT: retq
836 ; AVX512DQ-LABEL: store_i8_stride2_vf64:
838 ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0
839 ; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm1
840 ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm2
841 ; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm3
842 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm4
843 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm5
844 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm6
845 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm7
846 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
847 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
848 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
849 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
850 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
851 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
852 ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
853 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
854 ; AVX512DQ-NEXT: vmovdqa %xmm3, 96(%rdx)
855 ; AVX512DQ-NEXT: vmovdqa %xmm6, 112(%rdx)
856 ; AVX512DQ-NEXT: vmovdqa %xmm2, 64(%rdx)
857 ; AVX512DQ-NEXT: vmovdqa %xmm5, 80(%rdx)
858 ; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx)
859 ; AVX512DQ-NEXT: vmovdqa %xmm4, 48(%rdx)
860 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
861 ; AVX512DQ-NEXT: vmovdqa %xmm8, 16(%rdx)
862 ; AVX512DQ-NEXT: retq
; AVX512DQ-FCP-LABEL: store_i8_stride2_vf64:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa 48(%rsi), %xmm3
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm7
; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 96(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, 112(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 64(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, 80(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 48(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, 16(%rdx)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i8_stride2_vf64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rsi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512BW-NEXT: vmovdqa 32(%rsi), %xmm2
; AVX512BW-NEXT: vmovdqa 48(%rsi), %xmm3
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm4
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm7
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2
; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i8_stride2_vf64:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm0
; AVX512BW-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm2
; AVX512BW-FCP-NEXT: vmovdqa 48(%rsi), %xmm3
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm4
; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512BW-FCP-NEXT: vmovdqa 48(%rdi), %xmm7
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rdx)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i8_stride2_vf64:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm0
; AVX512DQ-BW-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %xmm2
; AVX512DQ-BW-NEXT: vmovdqa 48(%rsi), %xmm3
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm4
; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512DQ-BW-NEXT: vmovdqa 48(%rdi), %xmm7
; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3
; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rdx)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i8_stride2_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa 48(%rsi), %xmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa 48(%rdi), %xmm7
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15]
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm2, %ymm2
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rdx)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
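; The IR below is the pattern all of the prefixes above are checked against:
; the two <64 x i8> inputs are loaded, concatenated, and interleaved
; element-wise (stride 2) into a single <128 x i8> vector stored to %out.vec.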
  %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64
  %in.vec1 = load <64 x i8>, ptr %in.vecptr1, align 64
  %1 = shufflevector <64 x i8> %in.vec0, <64 x i8> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %interleaved.vec = shufflevector <128 x i8> %1, <128 x i8> poison, <128 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
  store <128 x i8> %interleaved.vec, ptr %out.vec, align 64