; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,FALLBACK0
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-ONLY,FALLBACK1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-SLOW,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST-PERLANE,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-ONLY,AVX512-SLOW,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-ONLY,AVX512-FAST,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512DQ-ONLY,AVX512-SLOW,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512DQ-ONLY,AVX512-FAST,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-ONLY,AVX512-SLOW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-ONLY,AVX512-FAST,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512DQBW-ONLY,AVX512-SLOW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512DQBW-ONLY,AVX512-FAST,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12

; These patterns are produced by the LoopVectorizer for interleaved loads.
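;
; As an illustrative sketch only (the function name and signature below are
; hypothetical, not taken from any particular source), a scalar loop of
; roughly this shape is what the vectorizer turns into the wide load plus
; even/odd shufflevector pattern exercised by the functions below:
;
;   void deinterleave_i8(const char *in, char *even, char *odd, int n) {
;     for (int i = 0; i < n; ++i) {
;       even[i] = in[2 * i];     // even lanes -> out.vec0
;       odd[i]  = in[2 * i + 1]; // odd lanes  -> out.vec1
;     }
;   }
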
define void @load_i8_stride2_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
; SSE-LABEL: load_i8_stride2_vf2:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: packuswb %xmm1, %xmm1
; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: movd %xmm1, %eax
; SSE-NEXT: movw %ax, (%rsi)
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: movw %ax, (%rdx)
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i8_stride2_vf2:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpextrw $0, %xmm1, (%rsi)
; AVX1-NEXT: vpextrw $0, %xmm0, (%rdx)
; AVX1-NEXT: retq
;
; AVX512F-LABEL: load_i8_stride2_vf2:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpextrw $0, %xmm1, (%rsi)
; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdx)
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: load_i8_stride2_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpmovwb %xmm0, %xmm1
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpextrw $0, %xmm1, (%rsi)
; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rdx)
; AVX512BW-NEXT: retq
  %wide.vec = load <4 x i8>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <4 x i8> %wide.vec, <4 x i8> poison, <2 x i32> <i32 0, i32 2>
  %strided.vec1 = shufflevector <4 x i8> %wide.vec, <4 x i8> poison, <2 x i32> <i32 1, i32 3>
  store <2 x i8> %strided.vec0, ptr %out.vec0, align 64
  store <2 x i8> %strided.vec1, ptr %out.vec1, align 64
  ret void
}

define void @load_i8_stride2_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
; SSE-LABEL: load_i8_stride2_vf4:
; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: packuswb %xmm1, %xmm1
; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: movd %xmm1, (%rsi)
; SSE-NEXT: movd %xmm0, (%rdx)
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i8_stride2_vf4:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vmovd %xmm1, (%rsi)
; AVX1-NEXT: vmovd %xmm0, (%rdx)
; AVX1-NEXT: retq
;
; AVX512F-LABEL: load_i8_stride2_vf4:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovd %xmm1, (%rsi)
; AVX512F-NEXT: vmovd %xmm0, (%rdx)
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: load_i8_stride2_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-NEXT: vpmovwb %xmm0, %xmm1
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovd %xmm1, (%rsi)
; AVX512BW-NEXT: vmovd %xmm0, (%rdx)
; AVX512BW-NEXT: retq
  %wide.vec = load <8 x i8>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <8 x i8> %wide.vec, <8 x i8> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %strided.vec1 = shufflevector <8 x i8> %wide.vec, <8 x i8> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i8> %strided.vec0, ptr %out.vec0, align 64
  store <4 x i8> %strided.vec1, ptr %out.vec1, align 64
  ret void
}

define void @load_i8_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
; SSE-LABEL: load_i8_stride2_vf8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: pand %xmm0, %xmm1
; SSE-NEXT: packuswb %xmm1, %xmm1
; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: movq %xmm1, (%rsi)
; SSE-NEXT: movq %xmm0, (%rdx)
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i8_stride2_vf8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vmovq %xmm1, (%rsi)
; AVX1-NEXT: vmovq %xmm0, (%rdx)
; AVX1-NEXT: retq
;
; AVX512F-LABEL: load_i8_stride2_vf8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovq %xmm1, (%rsi)
; AVX512F-NEXT: vmovq %xmm0, (%rdx)
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: load_i8_stride2_vf8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpmovwb %xmm0, (%rsi)
; AVX512BW-NEXT: vmovq %xmm1, (%rdx)
; AVX512BW-NEXT: retq
  %wide.vec = load <16 x i8>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <16 x i8> %wide.vec, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %strided.vec1 = shufflevector <16 x i8> %wide.vec, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i8> %strided.vec0, ptr %out.vec0, align 64
  store <8 x i8> %strided.vec1, ptr %out.vec1, align 64
  ret void
}

define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
; SSE-LABEL: load_i8_stride2_vf16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: pand %xmm2, %xmm3
; SSE-NEXT: pand %xmm0, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: psrlw $8, %xmm1
; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm2, (%rsi)
; SSE-NEXT: movdqa %xmm0, (%rdx)
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: load_i8_stride2_vf16:
; AVX1-ONLY: # %bb.0:
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX1-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm3
; AVX1-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rdx)
; AVX1-ONLY-NEXT: retq
;
; AVX2-ONLY-LABEL: load_i8_stride2_vf16:
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX2-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm3
; AVX2-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX2-ONLY-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rdx)
; AVX2-ONLY-NEXT: retq
;
; AVX512F-LABEL: load_i8_stride2_vf16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512F-NEXT: vpand %xmm0, %xmm2, %xmm3
; AVX512F-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: vmovdqa %xmm1, (%rdx)
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: load_i8_stride2_vf16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm1
; AVX512BW-NEXT: vpmovwb %ymm0, (%rsi)
; AVX512BW-NEXT: vpmovwb %ymm1, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %wide.vec = load <32 x i8>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <32 x i8> %wide.vec, <32 x i8> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  %strided.vec1 = shufflevector <32 x i8> %wide.vec, <32 x i8> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i8> %strided.vec0, ptr %out.vec0, align 64
  store <16 x i8> %strided.vec1, ptr %out.vec1, align 64
  ret void
}

define void @load_i8_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
; SSE-LABEL: load_i8_stride2_vf32:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: movdqa 32(%rdi), %xmm2
; SSE-NEXT: movdqa 48(%rdi), %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: movdqa %xmm3, %xmm5
; SSE-NEXT: pand %xmm4, %xmm5
; SSE-NEXT: movdqa %xmm2, %xmm6
; SSE-NEXT: pand %xmm4, %xmm6
; SSE-NEXT: packuswb %xmm5, %xmm6
; SSE-NEXT: movdqa %xmm1, %xmm5
; SSE-NEXT: pand %xmm4, %xmm5
; SSE-NEXT: pand %xmm0, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: psrlw $8, %xmm3
; SSE-NEXT: psrlw $8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: psrlw $8, %xmm1
; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm4, (%rsi)
; SSE-NEXT: movdqa %xmm6, 16(%rsi)
; SSE-NEXT: movdqa %xmm0, (%rdx)
; SSE-NEXT: movdqa %xmm2, 16(%rdx)
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: load_i8_stride2_vf32:
; AVX1-ONLY: # %bb.0:
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm3
; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4
; AVX1-ONLY-NEXT: vpand %xmm0, %xmm4, %xmm5
; AVX1-ONLY-NEXT: vpand %xmm0, %xmm3, %xmm6
; AVX1-ONLY-NEXT: vpackuswb %xmm5, %xmm6, %xmm5
; AVX1-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm6
; AVX1-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vpackuswb %xmm6, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-ONLY-NEXT: vmovdqa %xmm5, 16(%rsi)
; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx)
; AVX1-ONLY-NEXT: vzeroupper
; AVX1-ONLY-NEXT: retq
;
; AVX2-ONLY-LABEL: load_i8_stride2_vf32:
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rsi)
; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
;
; AVX512F-SLOW-LABEL: load_i8_stride2_vf32:
; AVX512F-SLOW: # %bb.0:
; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-SLOW-NEXT: vmovdqa %ymm2, (%rsi)
; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-SLOW-NEXT: vzeroupper
; AVX512F-SLOW-NEXT: retq
;
; AVX512F-FAST-LABEL: load_i8_stride2_vf32:
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,5,7]
; AVX512F-FAST-NEXT: vpermt2q %ymm2, %ymm4, %ymm3
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpermt2q %ymm1, %ymm4, %ymm0
; AVX512F-FAST-NEXT: vmovdqa %ymm3, (%rsi)
; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rdx)
; AVX512F-FAST-NEXT: vzeroupper
; AVX512F-FAST-NEXT: retq
;
; AVX512BW-LABEL: load_i8_stride2_vf32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT: vpmovwb %zmm1, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %wide.vec = load <64 x i8>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <64 x i8> %wide.vec, <64 x i8> poison, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  %strided.vec1 = shufflevector <64 x i8> %wide.vec, <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
  store <32 x i8> %strided.vec0, ptr %out.vec0, align 64
  store <32 x i8> %strided.vec1, ptr %out.vec1, align 64
  ret void
}

define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
; SSE-LABEL: load_i8_stride2_vf64:
; SSE: # %bb.0:
; SSE-NEXT: movdqa 64(%rdi), %xmm0
; SSE-NEXT: movdqa 80(%rdi), %xmm4
; SSE-NEXT: movdqa 96(%rdi), %xmm1
; SSE-NEXT: movdqa 112(%rdi), %xmm7
; SSE-NEXT: movdqa (%rdi), %xmm2
; SSE-NEXT: movdqa 16(%rdi), %xmm9
; SSE-NEXT: movdqa 32(%rdi), %xmm3
; SSE-NEXT: movdqa 48(%rdi), %xmm11
; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
; SSE-NEXT: movdqa %xmm11, %xmm8
; SSE-NEXT: pand %xmm6, %xmm8
; SSE-NEXT: movdqa %xmm3, %xmm5
; SSE-NEXT: pand %xmm6, %xmm5
; SSE-NEXT: packuswb %xmm8, %xmm5
; SSE-NEXT: movdqa %xmm9, %xmm10
; SSE-NEXT: pand %xmm6, %xmm10
; SSE-NEXT: movdqa %xmm2, %xmm8
; SSE-NEXT: pand %xmm6, %xmm8
; SSE-NEXT: packuswb %xmm10, %xmm8
; SSE-NEXT: movdqa %xmm7, %xmm12
; SSE-NEXT: pand %xmm6, %xmm12
; SSE-NEXT: movdqa %xmm1, %xmm10
; SSE-NEXT: pand %xmm6, %xmm10
; SSE-NEXT: packuswb %xmm12, %xmm10
; SSE-NEXT: movdqa %xmm4, %xmm12
; SSE-NEXT: pand %xmm6, %xmm12
; SSE-NEXT: pand %xmm0, %xmm6
; SSE-NEXT: packuswb %xmm12, %xmm6
; SSE-NEXT: psrlw $8, %xmm11
; SSE-NEXT: psrlw $8, %xmm3
; SSE-NEXT: packuswb %xmm11, %xmm3
; SSE-NEXT: psrlw $8, %xmm9
; SSE-NEXT: psrlw $8, %xmm2
; SSE-NEXT: packuswb %xmm9, %xmm2
; SSE-NEXT: psrlw $8, %xmm7
; SSE-NEXT: psrlw $8, %xmm1
; SSE-NEXT: packuswb %xmm7, %xmm1
; SSE-NEXT: psrlw $8, %xmm4
; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: movdqa %xmm6, 32(%rsi)
; SSE-NEXT: movdqa %xmm10, 48(%rsi)
; SSE-NEXT: movdqa %xmm8, (%rsi)
; SSE-NEXT: movdqa %xmm5, 16(%rsi)
; SSE-NEXT: movdqa %xmm0, 32(%rdx)
; SSE-NEXT: movdqa %xmm1, 48(%rdx)
; SSE-NEXT: movdqa %xmm2, (%rdx)
; SSE-NEXT: movdqa %xmm3, 16(%rdx)
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: load_i8_stride2_vf64:
; AVX1-ONLY: # %bb.0:
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2
; AVX1-ONLY-NEXT: vpand %xmm1, %xmm2, %xmm0
; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3
; AVX1-ONLY-NEXT: vpand %xmm1, %xmm3, %xmm4
; AVX1-ONLY-NEXT: vpackuswb %xmm0, %xmm4, %xmm0
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4
; AVX1-ONLY-NEXT: vpand %xmm1, %xmm4, %xmm5
; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm6
; AVX1-ONLY-NEXT: vpand %xmm1, %xmm6, %xmm7
; AVX1-ONLY-NEXT: vpackuswb %xmm5, %xmm7, %xmm5
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm8
; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm9
; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm10
; AVX1-ONLY-NEXT: vpand %xmm1, %xmm10, %xmm11
; AVX1-ONLY-NEXT: vpand %xmm1, %xmm9, %xmm12
; AVX1-ONLY-NEXT: vpackuswb %xmm11, %xmm12, %xmm11
; AVX1-ONLY-NEXT: vpand %xmm1, %xmm8, %xmm12
; AVX1-ONLY-NEXT: vpand %xmm1, %xmm7, %xmm1
; AVX1-ONLY-NEXT: vpackuswb %xmm12, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm4
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm4
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm8, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm6
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rsi)
; AVX1-ONLY-NEXT: vmovdqa %xmm11, 16(%rsi)
; AVX1-ONLY-NEXT: vmovdqa %xmm5, 32(%rsi)
; AVX1-ONLY-NEXT: vmovdqa %xmm0, 48(%rsi)
; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rdx)
; AVX1-ONLY-NEXT: vmovaps %ymm2, 32(%rdx)
; AVX1-ONLY-NEXT: vzeroupper
; AVX1-ONLY-NEXT: retq
;
; AVX2-ONLY-LABEL: load_i8_stride2_vf64:
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm2
; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm3
; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm3, %ymm5
; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm2, %ymm7
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,1,3]
; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm1, %ymm4
; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm0, %ymm6
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,1,3]
; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm3, %ymm3
; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm2, %ymm2
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm1, %ymm1
; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%rsi)
; AVX2-ONLY-NEXT: vmovdqa %ymm5, 32(%rsi)
; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rdx)
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
;
; AVX512F-SLOW-LABEL: load_i8_stride2_vf64:
; AVX512F-SLOW: # %bb.0:
; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3
; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm4
; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm4, %ymm5
; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm7
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7]
; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm0
; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm1, %ymm6
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3],ymm6[4,5],ymm0[6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,2,1,3,4,6,5,7]
; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm4
; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7]
; AVX512F-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2
; AVX512F-SLOW-NEXT: vpshufb %ymm6, %ymm1, %ymm1
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5],ymm2[6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,2,1,3,4,6,5,7]
; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, (%rsi)
; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512F-SLOW-NEXT: vzeroupper
; AVX512F-SLOW-NEXT: retq
;
; AVX512F-FAST-LABEL: load_i8_stride2_vf64:
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm1
; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm3
; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm4
; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm5
; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm6
; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm5
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,2,9,11,4,6,13,15]
; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm5
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm4
; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm0
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3
; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm6, %zmm1
; AVX512F-FAST-NEXT: vmovdqa64 %zmm5, (%rsi)
; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, (%rdx)
; AVX512F-FAST-NEXT: vzeroupper
; AVX512F-FAST-NEXT: retq
;
; AVX512BW-LABEL: load_i8_stride2_vf64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62]
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u,32,34,36,38,40,42,44,46,u,u,u,u,u,u,u,u,48,50,52,54,56,58,60,62,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,9,11,13,15]
; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm3
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63]
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u,33,35,37,39,41,43,45,47,u,u,u,u,u,u,u,u,49,51,53,55,57,59,61,63,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpermt2q %zmm1, %zmm4, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rsi)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
  %wide.vec = load <128 x i8>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <128 x i8> %wide.vec, <128 x i8> poison, <64 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62, i32 64, i32 66, i32 68, i32 70, i32 72, i32 74, i32 76, i32 78, i32 80, i32 82, i32 84, i32 86, i32 88, i32 90, i32 92, i32 94, i32 96, i32 98, i32 100, i32 102, i32 104, i32 106, i32 108, i32 110, i32 112, i32 114, i32 116, i32 118, i32 120, i32 122, i32 124, i32 126>
  %strided.vec1 = shufflevector <128 x i8> %wide.vec, <128 x i8> poison, <64 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63, i32 65, i32 67, i32 69, i32 71, i32 73, i32 75, i32 77, i32 79, i32 81, i32 83, i32 85, i32 87, i32 89, i32 91, i32 93, i32 95, i32 97, i32 99, i32 101, i32 103, i32 105, i32 107, i32 109, i32 111, i32 113, i32 115, i32 117, i32 119, i32 121, i32 123, i32 125, i32 127>
  store <64 x i8> %strided.vec0, ptr %out.vec0, align 64
  store <64 x i8> %strided.vec1, ptr %out.vec1, align 64
  ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX: {{.*}}
; AVX2: {{.*}}
; AVX2-FAST: {{.*}}
; AVX2-FAST-PERLANE: {{.*}}
; AVX2-SLOW: {{.*}}
; AVX512: {{.*}}
; AVX512-FAST: {{.*}}
; AVX512-SLOW: {{.*}}
; AVX512BW-FAST: {{.*}}
; AVX512BW-ONLY: {{.*}}
; AVX512BW-ONLY-FAST: {{.*}}
; AVX512BW-ONLY-SLOW: {{.*}}
; AVX512BW-SLOW: {{.*}}
; AVX512DQ-FAST: {{.*}}
; AVX512DQ-ONLY: {{.*}}
; AVX512DQ-SLOW: {{.*}}
; AVX512DQBW-FAST: {{.*}}
; AVX512DQBW-ONLY: {{.*}}
; AVX512DQBW-SLOW: {{.*}}
; AVX512F-ONLY: {{.*}}
; AVX512F-ONLY-FAST: {{.*}}
; AVX512F-ONLY-SLOW: {{.*}}