1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
16 ; These patterns are produced by LoopVectorizer for interleaved loads.
18 define void @load_i8_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
19 ; SSE-LABEL: load_i8_stride3_vf2:
21 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
22 ; SSE-NEXT: pxor %xmm1, %xmm1
23 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
24 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
25 ; SSE-NEXT: packuswb %xmm1, %xmm1
26 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
27 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
28 ; SSE-NEXT: packuswb %xmm2, %xmm2
29 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
30 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
31 ; SSE-NEXT: packuswb %xmm0, %xmm0
32 ; SSE-NEXT: movd %xmm1, %eax
33 ; SSE-NEXT: movw %ax, (%rsi)
34 ; SSE-NEXT: movd %xmm2, %eax
35 ; SSE-NEXT: movw %ax, (%rdx)
36 ; SSE-NEXT: movd %xmm0, %eax
37 ; SSE-NEXT: movw %ax, (%rcx)
40 ; AVX-LABEL: load_i8_stride3_vf2:
42 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
43 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
44 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
45 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
46 ; AVX-NEXT: vpextrw $0, %xmm1, (%rsi)
47 ; AVX-NEXT: vpextrw $0, %xmm2, (%rdx)
48 ; AVX-NEXT: vpextrw $0, %xmm0, (%rcx)
51 ; AVX2-LABEL: load_i8_stride3_vf2:
53 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
54 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
55 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
56 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
57 ; AVX2-NEXT: vpextrw $0, %xmm1, (%rsi)
58 ; AVX2-NEXT: vpextrw $0, %xmm2, (%rdx)
59 ; AVX2-NEXT: vpextrw $0, %xmm0, (%rcx)
62 ; AVX2-FP-LABEL: load_i8_stride3_vf2:
64 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
65 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
66 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
67 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
68 ; AVX2-FP-NEXT: vpextrw $0, %xmm1, (%rsi)
69 ; AVX2-FP-NEXT: vpextrw $0, %xmm2, (%rdx)
70 ; AVX2-FP-NEXT: vpextrw $0, %xmm0, (%rcx)
73 ; AVX2-FCP-LABEL: load_i8_stride3_vf2:
75 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
76 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
77 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
78 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
79 ; AVX2-FCP-NEXT: vpextrw $0, %xmm1, (%rsi)
80 ; AVX2-FCP-NEXT: vpextrw $0, %xmm2, (%rdx)
81 ; AVX2-FCP-NEXT: vpextrw $0, %xmm0, (%rcx)
84 ; AVX512-LABEL: load_i8_stride3_vf2:
86 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
87 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
88 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
89 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
90 ; AVX512-NEXT: vpextrw $0, %xmm1, (%rsi)
91 ; AVX512-NEXT: vpextrw $0, %xmm2, (%rdx)
92 ; AVX512-NEXT: vpextrw $0, %xmm0, (%rcx)
95 ; AVX512-FCP-LABEL: load_i8_stride3_vf2:
96 ; AVX512-FCP: # %bb.0:
97 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
98 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
99 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
100 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
101 ; AVX512-FCP-NEXT: vpextrw $0, %xmm1, (%rsi)
102 ; AVX512-FCP-NEXT: vpextrw $0, %xmm2, (%rdx)
103 ; AVX512-FCP-NEXT: vpextrw $0, %xmm0, (%rcx)
104 ; AVX512-FCP-NEXT: retq
106 ; AVX512DQ-LABEL: load_i8_stride3_vf2:
108 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
109 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
110 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
111 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
112 ; AVX512DQ-NEXT: vpextrw $0, %xmm1, (%rsi)
113 ; AVX512DQ-NEXT: vpextrw $0, %xmm2, (%rdx)
114 ; AVX512DQ-NEXT: vpextrw $0, %xmm0, (%rcx)
115 ; AVX512DQ-NEXT: retq
117 ; AVX512DQ-FCP-LABEL: load_i8_stride3_vf2:
118 ; AVX512DQ-FCP: # %bb.0:
119 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
120 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
121 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
122 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
123 ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm1, (%rsi)
124 ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm2, (%rdx)
125 ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm0, (%rcx)
126 ; AVX512DQ-FCP-NEXT: retq
128 ; AVX512BW-LABEL: load_i8_stride3_vf2:
130 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
131 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
132 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
133 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
134 ; AVX512BW-NEXT: vpextrw $0, %xmm1, (%rsi)
135 ; AVX512BW-NEXT: vpextrw $0, %xmm2, (%rdx)
136 ; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rcx)
137 ; AVX512BW-NEXT: retq
139 ; AVX512BW-FCP-LABEL: load_i8_stride3_vf2:
140 ; AVX512BW-FCP: # %bb.0:
141 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
142 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
143 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
144 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
145 ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm1, (%rsi)
146 ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm2, (%rdx)
147 ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm0, (%rcx)
148 ; AVX512BW-FCP-NEXT: retq
150 ; AVX512DQ-BW-LABEL: load_i8_stride3_vf2:
151 ; AVX512DQ-BW: # %bb.0:
152 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
153 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
154 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
155 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
156 ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm1, (%rsi)
157 ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm2, (%rdx)
158 ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm0, (%rcx)
159 ; AVX512DQ-BW-NEXT: retq
161 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride3_vf2:
162 ; AVX512DQ-BW-FCP: # %bb.0:
163 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
164 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
165 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
166 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
167 ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm1, (%rsi)
168 ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm2, (%rdx)
169 ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm0, (%rcx)
170 ; AVX512DQ-BW-FCP-NEXT: retq
171 %wide.vec = load <6 x i8>, ptr %in.vec, align 64
172 %strided.vec0 = shufflevector <6 x i8> %wide.vec, <6 x i8> poison, <2 x i32> <i32 0, i32 3>
173 %strided.vec1 = shufflevector <6 x i8> %wide.vec, <6 x i8> poison, <2 x i32> <i32 1, i32 4>
174 %strided.vec2 = shufflevector <6 x i8> %wide.vec, <6 x i8> poison, <2 x i32> <i32 2, i32 5>
175 store <2 x i8> %strided.vec0, ptr %out.vec0, align 64
176 store <2 x i8> %strided.vec1, ptr %out.vec1, align 64
177 store <2 x i8> %strided.vec2, ptr %out.vec2, align 64
181 define void @load_i8_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
182 ; SSE-LABEL: load_i8_stride3_vf4:
184 ; SSE-NEXT: movdqa (%rdi), %xmm0
185 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535]
186 ; SSE-NEXT: pxor %xmm2, %xmm2
187 ; SSE-NEXT: movdqa %xmm0, %xmm3
188 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
189 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
190 ; SSE-NEXT: movdqa %xmm0, %xmm2
191 ; SSE-NEXT: pand %xmm1, %xmm2
192 ; SSE-NEXT: pandn %xmm3, %xmm1
193 ; SSE-NEXT: por %xmm2, %xmm1
194 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
195 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
196 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
197 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
198 ; SSE-NEXT: packuswb %xmm1, %xmm1
199 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,3,2,3,4,5,6,7]
200 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[0,0]
201 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,3]
202 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
203 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
204 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
205 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7]
206 ; SSE-NEXT: packuswb %xmm3, %xmm3
207 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
208 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
209 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
210 ; SSE-NEXT: packuswb %xmm0, %xmm0
211 ; SSE-NEXT: movd %xmm1, (%rsi)
212 ; SSE-NEXT: movd %xmm3, (%rdx)
213 ; SSE-NEXT: movd %xmm0, (%rcx)
216 ; AVX-LABEL: load_i8_stride3_vf4:
218 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
219 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,6,9,u,u,u,u,u,u,u,u,u,u,u,u]
220 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,7,10,u,u,u,u,u,u,u,u,u,u,u,u]
221 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,u,u,u,u,u,u,u,u,u,u,u,u]
222 ; AVX-NEXT: vmovd %xmm1, (%rsi)
223 ; AVX-NEXT: vmovd %xmm2, (%rdx)
224 ; AVX-NEXT: vmovd %xmm0, (%rcx)
227 ; AVX2-LABEL: load_i8_stride3_vf4:
229 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
230 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,6,9,u,u,u,u,u,u,u,u,u,u,u,u]
231 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,7,10,u,u,u,u,u,u,u,u,u,u,u,u]
232 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,u,u,u,u,u,u,u,u,u,u,u,u]
233 ; AVX2-NEXT: vmovd %xmm1, (%rsi)
234 ; AVX2-NEXT: vmovd %xmm2, (%rdx)
235 ; AVX2-NEXT: vmovd %xmm0, (%rcx)
238 ; AVX2-FP-LABEL: load_i8_stride3_vf4:
240 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
241 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,6,9,u,u,u,u,u,u,u,u,u,u,u,u]
242 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,7,10,u,u,u,u,u,u,u,u,u,u,u,u]
243 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,u,u,u,u,u,u,u,u,u,u,u,u]
244 ; AVX2-FP-NEXT: vmovd %xmm1, (%rsi)
245 ; AVX2-FP-NEXT: vmovd %xmm2, (%rdx)
246 ; AVX2-FP-NEXT: vmovd %xmm0, (%rcx)
249 ; AVX2-FCP-LABEL: load_i8_stride3_vf4:
251 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
252 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,6,9,u,u,u,u,u,u,u,u,u,u,u,u]
253 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,7,10,u,u,u,u,u,u,u,u,u,u,u,u]
254 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,u,u,u,u,u,u,u,u,u,u,u,u]
255 ; AVX2-FCP-NEXT: vmovd %xmm1, (%rsi)
256 ; AVX2-FCP-NEXT: vmovd %xmm2, (%rdx)
257 ; AVX2-FCP-NEXT: vmovd %xmm0, (%rcx)
258 ; AVX2-FCP-NEXT: retq
260 ; AVX512-LABEL: load_i8_stride3_vf4:
262 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
263 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,6,9,u,u,u,u,u,u,u,u,u,u,u,u]
264 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,7,10,u,u,u,u,u,u,u,u,u,u,u,u]
265 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,u,u,u,u,u,u,u,u,u,u,u,u]
266 ; AVX512-NEXT: vmovd %xmm1, (%rsi)
267 ; AVX512-NEXT: vmovd %xmm2, (%rdx)
268 ; AVX512-NEXT: vmovd %xmm0, (%rcx)
271 ; AVX512-FCP-LABEL: load_i8_stride3_vf4:
272 ; AVX512-FCP: # %bb.0:
273 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
274 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,6,9,u,u,u,u,u,u,u,u,u,u,u,u]
275 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,7,10,u,u,u,u,u,u,u,u,u,u,u,u]
276 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,u,u,u,u,u,u,u,u,u,u,u,u]
277 ; AVX512-FCP-NEXT: vmovd %xmm1, (%rsi)
278 ; AVX512-FCP-NEXT: vmovd %xmm2, (%rdx)
279 ; AVX512-FCP-NEXT: vmovd %xmm0, (%rcx)
280 ; AVX512-FCP-NEXT: retq
282 ; AVX512DQ-LABEL: load_i8_stride3_vf4:
284 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
285 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,6,9,u,u,u,u,u,u,u,u,u,u,u,u]
286 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,7,10,u,u,u,u,u,u,u,u,u,u,u,u]
287 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,u,u,u,u,u,u,u,u,u,u,u,u]
288 ; AVX512DQ-NEXT: vmovd %xmm1, (%rsi)
289 ; AVX512DQ-NEXT: vmovd %xmm2, (%rdx)
290 ; AVX512DQ-NEXT: vmovd %xmm0, (%rcx)
291 ; AVX512DQ-NEXT: retq
293 ; AVX512DQ-FCP-LABEL: load_i8_stride3_vf4:
294 ; AVX512DQ-FCP: # %bb.0:
295 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
296 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,6,9,u,u,u,u,u,u,u,u,u,u,u,u]
297 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,7,10,u,u,u,u,u,u,u,u,u,u,u,u]
298 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,u,u,u,u,u,u,u,u,u,u,u,u]
299 ; AVX512DQ-FCP-NEXT: vmovd %xmm1, (%rsi)
300 ; AVX512DQ-FCP-NEXT: vmovd %xmm2, (%rdx)
301 ; AVX512DQ-FCP-NEXT: vmovd %xmm0, (%rcx)
302 ; AVX512DQ-FCP-NEXT: retq
304 ; AVX512BW-LABEL: load_i8_stride3_vf4:
306 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
307 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,6,9,u,u,u,u,u,u,u,u,u,u,u,u]
308 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,7,10,u,u,u,u,u,u,u,u,u,u,u,u]
309 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,u,u,u,u,u,u,u,u,u,u,u,u]
310 ; AVX512BW-NEXT: vmovd %xmm1, (%rsi)
311 ; AVX512BW-NEXT: vmovd %xmm2, (%rdx)
312 ; AVX512BW-NEXT: vmovd %xmm0, (%rcx)
313 ; AVX512BW-NEXT: retq
315 ; AVX512BW-FCP-LABEL: load_i8_stride3_vf4:
316 ; AVX512BW-FCP: # %bb.0:
317 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
318 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,6,9,u,u,u,u,u,u,u,u,u,u,u,u]
319 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,7,10,u,u,u,u,u,u,u,u,u,u,u,u]
320 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,u,u,u,u,u,u,u,u,u,u,u,u]
321 ; AVX512BW-FCP-NEXT: vmovd %xmm1, (%rsi)
322 ; AVX512BW-FCP-NEXT: vmovd %xmm2, (%rdx)
323 ; AVX512BW-FCP-NEXT: vmovd %xmm0, (%rcx)
324 ; AVX512BW-FCP-NEXT: retq
326 ; AVX512DQ-BW-LABEL: load_i8_stride3_vf4:
327 ; AVX512DQ-BW: # %bb.0:
328 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
329 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,6,9,u,u,u,u,u,u,u,u,u,u,u,u]
330 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,7,10,u,u,u,u,u,u,u,u,u,u,u,u]
331 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,u,u,u,u,u,u,u,u,u,u,u,u]
332 ; AVX512DQ-BW-NEXT: vmovd %xmm1, (%rsi)
333 ; AVX512DQ-BW-NEXT: vmovd %xmm2, (%rdx)
334 ; AVX512DQ-BW-NEXT: vmovd %xmm0, (%rcx)
335 ; AVX512DQ-BW-NEXT: retq
337 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride3_vf4:
338 ; AVX512DQ-BW-FCP: # %bb.0:
339 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
340 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,3,6,9,u,u,u,u,u,u,u,u,u,u,u,u]
341 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,4,7,10,u,u,u,u,u,u,u,u,u,u,u,u]
342 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,u,u,u,u,u,u,u,u,u,u,u,u]
343 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm1, (%rsi)
344 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm2, (%rdx)
345 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%rcx)
346 ; AVX512DQ-BW-FCP-NEXT: retq
347 %wide.vec = load <12 x i8>, ptr %in.vec, align 64
348 %strided.vec0 = shufflevector <12 x i8> %wide.vec, <12 x i8> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
349 %strided.vec1 = shufflevector <12 x i8> %wide.vec, <12 x i8> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
350 %strided.vec2 = shufflevector <12 x i8> %wide.vec, <12 x i8> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
351 store <4 x i8> %strided.vec0, ptr %out.vec0, align 64
352 store <4 x i8> %strided.vec1, ptr %out.vec1, align 64
353 store <4 x i8> %strided.vec2, ptr %out.vec2, align 64
357 define void @load_i8_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
358 ; SSE-LABEL: load_i8_stride3_vf8:
360 ; SSE-NEXT: movdqa (%rdi), %xmm0
361 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
362 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,0,255,255,255,255,255,255,255,255,255,255]
363 ; SSE-NEXT: movdqa %xmm0, %xmm2
364 ; SSE-NEXT: pand %xmm4, %xmm2
365 ; SSE-NEXT: pandn %xmm1, %xmm4
366 ; SSE-NEXT: por %xmm2, %xmm4
367 ; SSE-NEXT: pxor %xmm2, %xmm2
368 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
369 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0]
370 ; SSE-NEXT: pand %xmm5, %xmm4
371 ; SSE-NEXT: movdqa %xmm0, %xmm3
372 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
373 ; SSE-NEXT: pandn %xmm3, %xmm5
374 ; SSE-NEXT: por %xmm4, %xmm5
375 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,1,3]
376 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5]
377 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
378 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
379 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
380 ; SSE-NEXT: packuswb %xmm4, %xmm4
381 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,255,255,0,255,255,0,255,255,255,255,255,255,255,255,255]
382 ; SSE-NEXT: movdqa %xmm0, %xmm6
383 ; SSE-NEXT: pand %xmm5, %xmm6
384 ; SSE-NEXT: pandn %xmm1, %xmm5
385 ; SSE-NEXT: por %xmm6, %xmm5
386 ; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
387 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535]
388 ; SSE-NEXT: pand %xmm6, %xmm5
389 ; SSE-NEXT: pandn %xmm3, %xmm6
390 ; SSE-NEXT: por %xmm5, %xmm6
391 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm6[2,1,0,3,4,5,6,7]
392 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
393 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
394 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7]
395 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4]
396 ; SSE-NEXT: packuswb %xmm5, %xmm5
397 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,0,255,255,0,255,255,255,255,255,255,255,255]
398 ; SSE-NEXT: pand %xmm6, %xmm0
399 ; SSE-NEXT: pandn %xmm1, %xmm6
400 ; SSE-NEXT: por %xmm0, %xmm6
401 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
402 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,0,65535]
403 ; SSE-NEXT: pand %xmm0, %xmm6
404 ; SSE-NEXT: pandn %xmm3, %xmm0
405 ; SSE-NEXT: por %xmm6, %xmm0
406 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
407 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
408 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
409 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
410 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
411 ; SSE-NEXT: packuswb %xmm0, %xmm0
412 ; SSE-NEXT: movq %xmm4, (%rsi)
413 ; SSE-NEXT: movq %xmm5, (%rdx)
414 ; SSE-NEXT: movq %xmm0, (%rcx)
417 ; AVX-LABEL: load_i8_stride3_vf8:
419 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
420 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
421 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
422 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
423 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
424 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
425 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
426 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
427 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
428 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
429 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
430 ; AVX-NEXT: vmovq %xmm2, (%rsi)
431 ; AVX-NEXT: vmovq %xmm3, (%rdx)
432 ; AVX-NEXT: vmovq %xmm0, (%rcx)
435 ; AVX2-LABEL: load_i8_stride3_vf8:
437 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
438 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
439 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
440 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
441 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
442 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
443 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
444 ; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3
445 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
446 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
447 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
448 ; AVX2-NEXT: vmovq %xmm2, (%rsi)
449 ; AVX2-NEXT: vmovq %xmm3, (%rdx)
450 ; AVX2-NEXT: vmovq %xmm0, (%rcx)
453 ; AVX2-FP-LABEL: load_i8_stride3_vf8:
455 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
456 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
457 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
458 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
459 ; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2
460 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
461 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
462 ; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3
463 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
464 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
465 ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0
466 ; AVX2-FP-NEXT: vmovq %xmm2, (%rsi)
467 ; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
468 ; AVX2-FP-NEXT: vmovq %xmm0, (%rcx)
471 ; AVX2-FCP-LABEL: load_i8_stride3_vf8:
473 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
474 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
475 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
476 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
477 ; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
478 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
479 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
480 ; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
481 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
482 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
483 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
484 ; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi)
485 ; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
486 ; AVX2-FCP-NEXT: vmovq %xmm0, (%rcx)
487 ; AVX2-FCP-NEXT: retq
489 ; AVX512-LABEL: load_i8_stride3_vf8:
491 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
492 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
493 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
494 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
495 ; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2
496 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
497 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
498 ; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3
499 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
500 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
501 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
502 ; AVX512-NEXT: vmovq %xmm2, (%rsi)
503 ; AVX512-NEXT: vmovq %xmm3, (%rdx)
504 ; AVX512-NEXT: vmovq %xmm0, (%rcx)
507 ; AVX512-FCP-LABEL: load_i8_stride3_vf8:
508 ; AVX512-FCP: # %bb.0:
509 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
510 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
511 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
512 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
513 ; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
514 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
515 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
516 ; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
517 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
518 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
519 ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
520 ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
521 ; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
522 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
523 ; AVX512-FCP-NEXT: retq
525 ; AVX512DQ-LABEL: load_i8_stride3_vf8:
527 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
528 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
529 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
530 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
531 ; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2
532 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
533 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
534 ; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3
535 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
536 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
537 ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0
538 ; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
539 ; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
540 ; AVX512DQ-NEXT: vmovq %xmm0, (%rcx)
541 ; AVX512DQ-NEXT: retq
543 ; AVX512DQ-FCP-LABEL: load_i8_stride3_vf8:
544 ; AVX512DQ-FCP: # %bb.0:
545 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
546 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
547 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
548 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
549 ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
550 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
551 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
552 ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
553 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
554 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
555 ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
556 ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
557 ; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
558 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
559 ; AVX512DQ-FCP-NEXT: retq
561 ; AVX512BW-LABEL: load_i8_stride3_vf8:
563 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
564 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
565 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
566 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
567 ; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2
568 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
569 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
570 ; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3
571 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
572 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
573 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
574 ; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
575 ; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
576 ; AVX512BW-NEXT: vmovq %xmm0, (%rcx)
577 ; AVX512BW-NEXT: retq
579 ; AVX512BW-FCP-LABEL: load_i8_stride3_vf8:
580 ; AVX512BW-FCP: # %bb.0:
581 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
582 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
583 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
584 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
585 ; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
586 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
587 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
588 ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
589 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
590 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
591 ; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
592 ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
593 ; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
594 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
595 ; AVX512BW-FCP-NEXT: retq
597 ; AVX512DQ-BW-LABEL: load_i8_stride3_vf8:
598 ; AVX512DQ-BW: # %bb.0:
599 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
600 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1
601 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
602 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
603 ; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2
604 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
605 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
606 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3
607 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
608 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
609 ; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0
610 ; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
611 ; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
612 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx)
613 ; AVX512DQ-BW-NEXT: retq
615 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride3_vf8:
616 ; AVX512DQ-BW-FCP: # %bb.0:
617 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
618 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
619 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
620 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
621 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
622 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
623 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
624 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
625 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
626 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
627 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
628 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
629 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
630 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
631 ; AVX512DQ-BW-FCP-NEXT: retq
632 %wide.vec = load <24 x i8>, ptr %in.vec, align 64
633 %strided.vec0 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
634 %strided.vec1 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
635 %strided.vec2 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
636 store <8 x i8> %strided.vec0, ptr %out.vec0, align 64
637 store <8 x i8> %strided.vec1, ptr %out.vec1, align 64
638 store <8 x i8> %strided.vec2, ptr %out.vec2, align 64
642 define void @load_i8_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
643 ; SSE-LABEL: load_i8_stride3_vf16:
645 ; SSE-NEXT: movdqa (%rdi), %xmm5
646 ; SSE-NEXT: movdqa 16(%rdi), %xmm4
647 ; SSE-NEXT: movdqa 32(%rdi), %xmm0
648 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255]
649 ; SSE-NEXT: movdqa %xmm3, %xmm1
650 ; SSE-NEXT: pandn %xmm4, %xmm1
651 ; SSE-NEXT: movdqa %xmm5, %xmm2
652 ; SSE-NEXT: pand %xmm3, %xmm2
653 ; SSE-NEXT: por %xmm1, %xmm2
654 ; SSE-NEXT: pxor %xmm6, %xmm6
655 ; SSE-NEXT: movdqa %xmm2, %xmm7
656 ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
657 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,65535,0]
658 ; SSE-NEXT: movdqa %xmm1, %xmm8
659 ; SSE-NEXT: pandn %xmm7, %xmm8
660 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
661 ; SSE-NEXT: pand %xmm1, %xmm2
662 ; SSE-NEXT: por %xmm8, %xmm2
663 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
664 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
665 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
666 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
667 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,6,5,4,7]
668 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,3,2,3,4,5,6,7]
669 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
670 ; SSE-NEXT: packuswb %xmm2, %xmm10
671 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
672 ; SSE-NEXT: pand %xmm2, %xmm10
673 ; SSE-NEXT: movdqa %xmm0, %xmm7
674 ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
675 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,0,65535,65535]
676 ; SSE-NEXT: movdqa %xmm9, %xmm8
677 ; SSE-NEXT: pandn %xmm7, %xmm8
678 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
679 ; SSE-NEXT: movdqa %xmm0, %xmm11
680 ; SSE-NEXT: pand %xmm9, %xmm11
681 ; SSE-NEXT: por %xmm8, %xmm11
682 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[3,1,2,0]
683 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
684 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,0]
685 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5]
686 ; SSE-NEXT: packuswb %xmm8, %xmm11
687 ; SSE-NEXT: movdqa %xmm2, %xmm8
688 ; SSE-NEXT: pandn %xmm11, %xmm8
689 ; SSE-NEXT: por %xmm10, %xmm8
690 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255]
691 ; SSE-NEXT: movdqa %xmm4, %xmm11
692 ; SSE-NEXT: pand %xmm10, %xmm11
693 ; SSE-NEXT: pandn %xmm5, %xmm10
694 ; SSE-NEXT: por %xmm11, %xmm10
695 ; SSE-NEXT: movdqa %xmm10, %xmm11
696 ; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15]
697 ; SSE-NEXT: movdqa %xmm9, %xmm12
698 ; SSE-NEXT: pandn %xmm11, %xmm12
699 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
700 ; SSE-NEXT: pand %xmm9, %xmm10
701 ; SSE-NEXT: por %xmm12, %xmm10
702 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[2,1,0,3,4,5,6,7]
703 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7]
704 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,3,2,1]
705 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[1,2,3,0,4,5,6,7]
706 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4]
707 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7]
708 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3]
709 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,2,3,3,4,5,6,7]
710 ; SSE-NEXT: packuswb %xmm11, %xmm10
711 ; SSE-NEXT: pand %xmm2, %xmm10
712 ; SSE-NEXT: movdqa %xmm7, %xmm11
713 ; SSE-NEXT: pand %xmm9, %xmm11
714 ; SSE-NEXT: pandn %xmm0, %xmm9
715 ; SSE-NEXT: por %xmm11, %xmm9
716 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7]
717 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7]
718 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,0]
719 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7]
720 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4]
721 ; SSE-NEXT: packuswb %xmm9, %xmm9
722 ; SSE-NEXT: pandn %xmm9, %xmm2
723 ; SSE-NEXT: por %xmm10, %xmm2
724 ; SSE-NEXT: pand %xmm3, %xmm4
725 ; SSE-NEXT: pandn %xmm5, %xmm3
726 ; SSE-NEXT: por %xmm4, %xmm3
727 ; SSE-NEXT: movdqa %xmm3, %xmm4
728 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
729 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,0,65535,65535,0,65535]
730 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
731 ; SSE-NEXT: pand %xmm5, %xmm3
732 ; SSE-NEXT: pandn %xmm4, %xmm5
733 ; SSE-NEXT: por %xmm3, %xmm5
734 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[3,1,2,0]
735 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
736 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0]
737 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,0,3,4,5,6,7]
738 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
739 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
740 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
741 ; SSE-NEXT: packuswb %xmm4, %xmm3
742 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0]
743 ; SSE-NEXT: pand %xmm4, %xmm3
744 ; SSE-NEXT: pand %xmm1, %xmm0
745 ; SSE-NEXT: pandn %xmm7, %xmm1
746 ; SSE-NEXT: por %xmm0, %xmm1
747 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,1,3]
748 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
749 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
750 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
751 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
752 ; SSE-NEXT: packuswb %xmm0, %xmm0
753 ; SSE-NEXT: pandn %xmm0, %xmm4
754 ; SSE-NEXT: por %xmm3, %xmm4
755 ; SSE-NEXT: movdqa %xmm8, (%rsi)
756 ; SSE-NEXT: movdqa %xmm2, (%rdx)
757 ; SSE-NEXT: movdqa %xmm4, (%rcx)
760 ; AVX-LABEL: load_i8_stride3_vf16:
762 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
763 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
764 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
765 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
766 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
767 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
768 ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
769 ; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
770 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
771 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
772 ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
773 ; AVX-NEXT: vpmovsxdq {{.*#+}} xmm4 = [18446744073709551615,16777215]
774 ; AVX-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
775 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
776 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
777 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
778 ; AVX-NEXT: vmovdqa %xmm1, (%rdx)
779 ; AVX-NEXT: vmovdqa %xmm2, (%rcx)
782 ; AVX2-LABEL: load_i8_stride3_vf16:
784 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
785 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
786 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
787 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
788 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
789 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
790 ; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
791 ; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
792 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
793 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
794 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
795 ; AVX2-NEXT: vpmovsxdq {{.*#+}} xmm4 = [18446744073709551615,16777215]
796 ; AVX2-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
797 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
798 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
799 ; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
800 ; AVX2-NEXT: vmovdqa %xmm1, (%rdx)
801 ; AVX2-NEXT: vmovdqa %xmm2, (%rcx)
804 ; AVX2-FP-LABEL: load_i8_stride3_vf16:
806 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
807 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
808 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2
809 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
810 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
811 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
812 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
813 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
814 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
815 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
816 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
817 ; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} xmm4 = [18446744073709551615,16777215]
818 ; AVX2-FP-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
819 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
820 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
821 ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rsi)
822 ; AVX2-FP-NEXT: vmovdqa %xmm1, (%rdx)
823 ; AVX2-FP-NEXT: vmovdqa %xmm2, (%rcx)
826 ; AVX2-FCP-LABEL: load_i8_stride3_vf16:
828 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
829 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
830 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
831 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
832 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
833 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
834 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
835 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
836 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
837 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
838 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
839 ; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} xmm4 = [18446744073709551615,16777215]
840 ; AVX2-FCP-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
841 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
842 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
843 ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rsi)
844 ; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rdx)
845 ; AVX2-FCP-NEXT: vmovdqa %xmm2, (%rcx)
846 ; AVX2-FCP-NEXT: retq
848 ; AVX512-LABEL: load_i8_stride3_vf16:
850 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
851 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
852 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
853 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
854 ; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
855 ; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
856 ; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
857 ; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
858 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
859 ; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
860 ; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
861 ; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
862 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
863 ; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
864 ; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
865 ; AVX512-NEXT: vmovdqa %xmm1, (%rdx)
866 ; AVX512-NEXT: vmovdqa %xmm2, (%rcx)
869 ; AVX512-FCP-LABEL: load_i8_stride3_vf16:
870 ; AVX512-FCP: # %bb.0:
871 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
872 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
873 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
874 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
875 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
876 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
877 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
878 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
879 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
880 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
881 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
882 ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
883 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
884 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
885 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi)
886 ; AVX512-FCP-NEXT: vmovdqa %xmm1, (%rdx)
887 ; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rcx)
888 ; AVX512-FCP-NEXT: retq
890 ; AVX512DQ-LABEL: load_i8_stride3_vf16:
892 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
893 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
894 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2
895 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
896 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0
897 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1
898 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm2
899 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
900 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
901 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
902 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
903 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
904 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
905 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
906 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi)
907 ; AVX512DQ-NEXT: vmovdqa %xmm1, (%rdx)
908 ; AVX512DQ-NEXT: vmovdqa %xmm2, (%rcx)
909 ; AVX512DQ-NEXT: retq
911 ; AVX512DQ-FCP-LABEL: load_i8_stride3_vf16:
912 ; AVX512DQ-FCP: # %bb.0:
913 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
914 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
915 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
916 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
917 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
918 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
919 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
920 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
921 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
922 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
923 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
924 ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
925 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
926 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
927 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi)
928 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%rdx)
929 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rcx)
930 ; AVX512DQ-FCP-NEXT: retq
932 ; AVX512BW-LABEL: load_i8_stride3_vf16:
934 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
935 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
936 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
937 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
938 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
939 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
940 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
941 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
942 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
943 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
944 ; AVX512BW-NEXT: movw $-2048, %ax # imm = 0xF800
945 ; AVX512BW-NEXT: kmovd %eax, %k1
946 ; AVX512BW-NEXT: vpblendmb %xmm1, %xmm0, %xmm2 {%k1}
947 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
948 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
949 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
950 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
951 ; AVX512BW-NEXT: vmovdqa %xmm2, (%rdx)
952 ; AVX512BW-NEXT: vmovdqa %xmm1, (%rcx)
953 ; AVX512BW-NEXT: retq
955 ; AVX512BW-FCP-LABEL: load_i8_stride3_vf16:
956 ; AVX512BW-FCP: # %bb.0:
957 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
958 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
959 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
960 ; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
961 ; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
962 ; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
963 ; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
964 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
965 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
966 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
967 ; AVX512BW-FCP-NEXT: movw $-2048, %ax # imm = 0xF800
968 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
969 ; AVX512BW-FCP-NEXT: vpblendmb %xmm1, %xmm0, %xmm2 {%k1}
970 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
971 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
972 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
973 ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi)
974 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rdx)
975 ; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%rcx)
976 ; AVX512BW-FCP-NEXT: retq
978 ; AVX512DQ-BW-LABEL: load_i8_stride3_vf16:
979 ; AVX512DQ-BW: # %bb.0:
980 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
981 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1
982 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2
983 ; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
984 ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
985 ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
986 ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
987 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
988 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
989 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
990 ; AVX512DQ-BW-NEXT: movw $-2048, %ax # imm = 0xF800
991 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
992 ; AVX512DQ-BW-NEXT: vpblendmb %xmm1, %xmm0, %xmm2 {%k1}
993 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
994 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
995 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
996 ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi)
997 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rdx)
998 ; AVX512DQ-BW-NEXT: vmovdqa %xmm1, (%rcx)
999 ; AVX512DQ-BW-NEXT: retq
1001 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride3_vf16:
1002 ; AVX512DQ-BW-FCP: # %bb.0:
1003 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
1004 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
1005 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
1006 ; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1007 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
1008 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1009 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1010 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
1011 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
1012 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
1013 ; AVX512DQ-BW-FCP-NEXT: movw $-2048, %ax # imm = 0xF800
1014 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
1015 ; AVX512DQ-BW-FCP-NEXT: vpblendmb %xmm1, %xmm0, %xmm2 {%k1}
1016 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
1017 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
1018 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
1019 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi)
1020 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rdx)
1021 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%rcx)
1022 ; AVX512DQ-BW-FCP-NEXT: retq
1023 %wide.vec = load <48 x i8>, ptr %in.vec, align 64
1024 %strided.vec0 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
1025 %strided.vec1 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
1026 %strided.vec2 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
1027 store <16 x i8> %strided.vec0, ptr %out.vec0, align 64
1028 store <16 x i8> %strided.vec1, ptr %out.vec1, align 64
1029 store <16 x i8> %strided.vec2, ptr %out.vec2, align 64
1033 define void @load_i8_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
1034 ; SSE-LABEL: load_i8_stride3_vf32:
1036 ; SSE-NEXT: movdqa 64(%rdi), %xmm2
1037 ; SSE-NEXT: movdqa (%rdi), %xmm6
1038 ; SSE-NEXT: movdqa 16(%rdi), %xmm4
1039 ; SSE-NEXT: movdqa 32(%rdi), %xmm8
1040 ; SSE-NEXT: movdqa 48(%rdi), %xmm12
1041 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1042 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255]
1043 ; SSE-NEXT: movdqa %xmm13, %xmm7
1044 ; SSE-NEXT: pandn %xmm4, %xmm7
1045 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255]
1046 ; SSE-NEXT: movdqa %xmm5, %xmm14
1047 ; SSE-NEXT: pandn %xmm6, %xmm14
1048 ; SSE-NEXT: movdqa %xmm13, %xmm0
1049 ; SSE-NEXT: pandn %xmm6, %xmm0
1050 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1051 ; SSE-NEXT: movdqa %xmm6, %xmm0
1052 ; SSE-NEXT: pand %xmm13, %xmm0
1053 ; SSE-NEXT: por %xmm7, %xmm0
1054 ; SSE-NEXT: pxor %xmm9, %xmm9
1055 ; SSE-NEXT: movdqa %xmm0, %xmm1
1056 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
1057 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0]
1058 ; SSE-NEXT: movdqa %xmm6, %xmm3
1059 ; SSE-NEXT: pandn %xmm1, %xmm3
1060 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
1061 ; SSE-NEXT: pand %xmm6, %xmm0
1062 ; SSE-NEXT: por %xmm3, %xmm0
1063 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1064 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
1065 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
1066 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
1067 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
1068 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
1069 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
1070 ; SSE-NEXT: packuswb %xmm1, %xmm0
1071 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1072 ; SSE-NEXT: pand %xmm7, %xmm0
1073 ; SSE-NEXT: movdqa %xmm8, %xmm3
1074 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
1075 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,0,65535,65535]
1076 ; SSE-NEXT: movdqa %xmm15, %xmm1
1077 ; SSE-NEXT: pandn %xmm3, %xmm1
1078 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
1079 ; SSE-NEXT: movdqa %xmm8, %xmm10
1080 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1081 ; SSE-NEXT: pand %xmm15, %xmm10
1082 ; SSE-NEXT: por %xmm1, %xmm10
1083 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,1,2,0]
1084 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
1085 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,0]
1086 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
1087 ; SSE-NEXT: packuswb %xmm1, %xmm1
1088 ; SSE-NEXT: movdqa %xmm7, %xmm10
1089 ; SSE-NEXT: pandn %xmm1, %xmm10
1090 ; SSE-NEXT: por %xmm0, %xmm10
1091 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1092 ; SSE-NEXT: movdqa %xmm13, %xmm0
1093 ; SSE-NEXT: pandn %xmm2, %xmm0
1094 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1095 ; SSE-NEXT: movdqa %xmm12, %xmm1
1096 ; SSE-NEXT: pand %xmm13, %xmm1
1097 ; SSE-NEXT: por %xmm0, %xmm1
1098 ; SSE-NEXT: movdqa %xmm1, %xmm0
1099 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
1100 ; SSE-NEXT: movdqa %xmm6, %xmm10
1101 ; SSE-NEXT: pandn %xmm0, %xmm10
1102 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
1103 ; SSE-NEXT: pand %xmm6, %xmm1
1104 ; SSE-NEXT: por %xmm10, %xmm1
1105 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
1106 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
1107 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
1108 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
1109 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
1110 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
1111 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
1112 ; SSE-NEXT: packuswb %xmm0, %xmm1
1113 ; SSE-NEXT: movdqa 80(%rdi), %xmm10
1114 ; SSE-NEXT: movdqa %xmm10, %xmm12
1115 ; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
1116 ; SSE-NEXT: movdqa %xmm15, %xmm0
1117 ; SSE-NEXT: pandn %xmm12, %xmm0
1118 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
1119 ; SSE-NEXT: movdqa %xmm10, %xmm11
1120 ; SSE-NEXT: pand %xmm15, %xmm11
1121 ; SSE-NEXT: por %xmm0, %xmm11
1122 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,1,2,0]
1123 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
1124 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0]
1125 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
1126 ; SSE-NEXT: packuswb %xmm0, %xmm0
1127 ; SSE-NEXT: movdqa %xmm7, %xmm11
1128 ; SSE-NEXT: pandn %xmm0, %xmm11
1129 ; SSE-NEXT: pand %xmm7, %xmm1
1130 ; SSE-NEXT: por %xmm1, %xmm11
1131 ; SSE-NEXT: movdqa %xmm4, %xmm0
1132 ; SSE-NEXT: pand %xmm5, %xmm0
1133 ; SSE-NEXT: por %xmm14, %xmm0
1134 ; SSE-NEXT: movdqa %xmm0, %xmm1
1135 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
1136 ; SSE-NEXT: movdqa %xmm15, %xmm14
1137 ; SSE-NEXT: pandn %xmm1, %xmm14
1138 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
1139 ; SSE-NEXT: pand %xmm15, %xmm0
1140 ; SSE-NEXT: por %xmm14, %xmm0
1141 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
1142 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
1143 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
1144 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
1145 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,5,6,7,4]
1146 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,7,6,7]
1147 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1148 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,3,4,5,6,7]
1149 ; SSE-NEXT: packuswb %xmm0, %xmm14
1150 ; SSE-NEXT: movdqa %xmm15, %xmm0
1151 ; SSE-NEXT: pandn %xmm8, %xmm0
1152 ; SSE-NEXT: movdqa %xmm6, %xmm1
1153 ; SSE-NEXT: pandn %xmm3, %xmm1
1154 ; SSE-NEXT: pand %xmm15, %xmm3
1155 ; SSE-NEXT: por %xmm0, %xmm3
1156 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,3,2,3,4,5,6,7]
1157 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
1158 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
1159 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
1160 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
1161 ; SSE-NEXT: packuswb %xmm0, %xmm0
1162 ; SSE-NEXT: movdqa %xmm7, %xmm3
1163 ; SSE-NEXT: pandn %xmm0, %xmm3
1164 ; SSE-NEXT: pand %xmm7, %xmm14
1165 ; SSE-NEXT: por %xmm14, %xmm3
1166 ; SSE-NEXT: pand %xmm5, %xmm2
1167 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1168 ; SSE-NEXT: pandn %xmm8, %xmm5
1169 ; SSE-NEXT: por %xmm2, %xmm5
1170 ; SSE-NEXT: movdqa %xmm5, %xmm0
1171 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
1172 ; SSE-NEXT: movdqa %xmm15, %xmm14
1173 ; SSE-NEXT: pandn %xmm0, %xmm14
1174 ; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
1175 ; SSE-NEXT: pand %xmm15, %xmm5
1176 ; SSE-NEXT: por %xmm14, %xmm5
1177 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,0,3,4,5,6,7]
1178 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
1179 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
1180 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,3,0,4,5,6,7]
1181 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4]
1182 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
1183 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1184 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,3,4,5,6,7]
1185 ; SSE-NEXT: packuswb %xmm0, %xmm5
1186 ; SSE-NEXT: movdqa %xmm12, %xmm0
1187 ; SSE-NEXT: pand %xmm15, %xmm0
1188 ; SSE-NEXT: pandn %xmm10, %xmm15
1189 ; SSE-NEXT: por %xmm0, %xmm15
1190 ; SSE-NEXT: pand %xmm7, %xmm5
1191 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,3,2,3,4,5,6,7]
1192 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
1193 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
1194 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
1195 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
1196 ; SSE-NEXT: packuswb %xmm0, %xmm0
1197 ; SSE-NEXT: pandn %xmm0, %xmm7
1198 ; SSE-NEXT: por %xmm5, %xmm7
1199 ; SSE-NEXT: pand %xmm13, %xmm4
1200 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
1201 ; SSE-NEXT: movdqa %xmm4, %xmm0
1202 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
1203 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,0,65535,65535,0,65535]
1204 ; SSE-NEXT: movdqa %xmm5, %xmm2
1205 ; SSE-NEXT: pandn %xmm0, %xmm2
1206 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
1207 ; SSE-NEXT: pand %xmm5, %xmm4
1208 ; SSE-NEXT: por %xmm2, %xmm4
1209 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,1,2,0]
1210 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
1211 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
1212 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
1213 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,4,7]
1214 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
1215 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
1216 ; SSE-NEXT: packuswb %xmm0, %xmm4
1217 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,0,0]
1218 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1219 ; SSE-NEXT: pand %xmm6, %xmm0
1220 ; SSE-NEXT: por %xmm1, %xmm0
1221 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1222 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
1223 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1224 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
1225 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
1226 ; SSE-NEXT: packuswb %xmm0, %xmm1
1227 ; SSE-NEXT: movdqa %xmm2, %xmm0
1228 ; SSE-NEXT: pandn %xmm1, %xmm0
1229 ; SSE-NEXT: pand %xmm2, %xmm4
1230 ; SSE-NEXT: por %xmm4, %xmm0
1231 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1232 ; SSE-NEXT: pand %xmm13, %xmm1
1233 ; SSE-NEXT: pandn %xmm8, %xmm13
1234 ; SSE-NEXT: por %xmm1, %xmm13
1235 ; SSE-NEXT: movdqa %xmm13, %xmm1
1236 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
1237 ; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7]
1238 ; SSE-NEXT: pand %xmm5, %xmm13
1239 ; SSE-NEXT: pandn %xmm1, %xmm5
1240 ; SSE-NEXT: por %xmm13, %xmm5
1241 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[3,1,2,0]
1242 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5]
1243 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,0]
1244 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,0,3,4,5,6,7]
1245 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
1246 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
1247 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
1248 ; SSE-NEXT: packuswb %xmm1, %xmm4
1249 ; SSE-NEXT: pand %xmm6, %xmm10
1250 ; SSE-NEXT: pandn %xmm12, %xmm6
1251 ; SSE-NEXT: por %xmm10, %xmm6
1252 ; SSE-NEXT: pand %xmm2, %xmm4
1253 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,1,3]
1254 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
1255 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
1256 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
1257 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
1258 ; SSE-NEXT: packuswb %xmm1, %xmm1
1259 ; SSE-NEXT: pandn %xmm1, %xmm2
1260 ; SSE-NEXT: por %xmm4, %xmm2
1261 ; SSE-NEXT: movdqa %xmm11, 16(%rsi)
1262 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1263 ; SSE-NEXT: movaps %xmm1, (%rsi)
1264 ; SSE-NEXT: movdqa %xmm7, 16(%rdx)
1265 ; SSE-NEXT: movdqa %xmm3, (%rdx)
1266 ; SSE-NEXT: movdqa %xmm2, 16(%rcx)
1267 ; SSE-NEXT: movdqa %xmm0, (%rcx)
1270 ; AVX-LABEL: load_i8_stride3_vf32:
1272 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1273 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
1274 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
1275 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm3
1276 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm4
1277 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm5
1278 ; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1279 ; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm0
1280 ; AVX-NEXT: vpshufb %xmm6, %xmm3, %xmm3
1281 ; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm1
1282 ; AVX-NEXT: vpshufb %xmm6, %xmm4, %xmm4
1283 ; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm2
1284 ; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5
1285 ; AVX-NEXT: vpalignr {{.*#+}} xmm6 = xmm5[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
1286 ; AVX-NEXT: vpalignr {{.*#+}} xmm7 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
1287 ; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
1288 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
1289 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm8
1290 ; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
1291 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
1292 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm2
1293 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
1294 ; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10]
1295 ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1296 ; AVX-NEXT: # ymm5 = mem[0,1,0,1]
1297 ; AVX-NEXT: vandnps %ymm2, %ymm5, %ymm2
1298 ; AVX-NEXT: vandps %ymm5, %ymm8, %ymm5
1299 ; AVX-NEXT: vorps %ymm2, %ymm5, %ymm2
1300 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7,8,9,10]
1301 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
1302 ; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
1303 ; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
1304 ; AVX-NEXT: vmovdqa %xmm3, 16(%rsi)
1305 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
1306 ; AVX-NEXT: vmovaps %ymm2, (%rdx)
1307 ; AVX-NEXT: vmovdqa %xmm4, 16(%rcx)
1308 ; AVX-NEXT: vmovdqa %xmm1, (%rcx)
1309 ; AVX-NEXT: vzeroupper
1312 ; AVX2-LABEL: load_i8_stride3_vf32:
1314 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
1315 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
1316 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
1317 ; AVX2-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
1318 ; AVX2-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
1319 ; AVX2-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
1320 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1321 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
1322 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1323 ; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
1324 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1325 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
1326 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1327 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
1328 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1329 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1330 ; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
1331 ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
1332 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
1333 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
1334 ; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
1335 ; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
1336 ; AVX2-NEXT: vmovdqa %ymm2, (%rcx)
1337 ; AVX2-NEXT: vzeroupper
1340 ; AVX2-FP-LABEL: load_i8_stride3_vf32:
1342 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
1343 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
1344 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2
1345 ; AVX2-FP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
1346 ; AVX2-FP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
1347 ; AVX2-FP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
1348 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1349 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1]
1350 ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1351 ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
1352 ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1353 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
1354 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1355 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
1356 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1357 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1358 ; AVX2-FP-NEXT: # ymm4 = mem[0,1,0,1]
1359 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
1360 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
1361 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
1362 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rsi)
1363 ; AVX2-FP-NEXT: vmovdqa %ymm1, (%rdx)
1364 ; AVX2-FP-NEXT: vmovdqa %ymm2, (%rcx)
1365 ; AVX2-FP-NEXT: vzeroupper
1366 ; AVX2-FP-NEXT: retq
1368 ; AVX2-FCP-LABEL: load_i8_stride3_vf32:
1369 ; AVX2-FCP: # %bb.0:
1370 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
1371 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
1372 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
1373 ; AVX2-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
1374 ; AVX2-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
1375 ; AVX2-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
1376 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1377 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
1378 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1379 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
1380 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1381 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
1382 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1383 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
1384 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1385 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1386 ; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1]
1387 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
1388 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
1389 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
1390 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rsi)
1391 ; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rdx)
1392 ; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rcx)
1393 ; AVX2-FCP-NEXT: vzeroupper
1394 ; AVX2-FCP-NEXT: retq
1396 ; AVX512-LABEL: load_i8_stride3_vf32:
1398 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
1399 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
1400 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
1401 ; AVX512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
1402 ; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
1403 ; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
1404 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1405 ; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
1406 ; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1407 ; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm1
1408 ; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1409 ; AVX512-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
1410 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1411 ; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
1412 ; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1413 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1414 ; AVX512-NEXT: # ymm4 = mem[0,1,0,1]
1415 ; AVX512-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm4
1416 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
1417 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
1418 ; AVX512-NEXT: vmovdqa %ymm0, (%rsi)
1419 ; AVX512-NEXT: vmovdqa %ymm4, (%rdx)
1420 ; AVX512-NEXT: vmovdqa %ymm2, (%rcx)
1421 ; AVX512-NEXT: vzeroupper
1424 ; AVX512-FCP-LABEL: load_i8_stride3_vf32:
1425 ; AVX512-FCP: # %bb.0:
1426 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
1427 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
1428 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
1429 ; AVX512-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
1430 ; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
1431 ; AVX512-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
1432 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1433 ; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1]
1434 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1435 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
1436 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1437 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
1438 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1439 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
1440 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1441 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1442 ; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1]
1443 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm4
1444 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
1445 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
1446 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi)
1447 ; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rdx)
1448 ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rcx)
1449 ; AVX512-FCP-NEXT: vzeroupper
1450 ; AVX512-FCP-NEXT: retq
1452 ; AVX512DQ-LABEL: load_i8_stride3_vf32:
1453 ; AVX512DQ: # %bb.0:
1454 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
1455 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
1456 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2
1457 ; AVX512DQ-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
1458 ; AVX512DQ-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
1459 ; AVX512DQ-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
1460 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1461 ; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
1462 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1463 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1
1464 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1465 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
1466 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1467 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
1468 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1469 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1470 ; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1]
1471 ; AVX512DQ-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm4
1472 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
1473 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
1474 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rsi)
1475 ; AVX512DQ-NEXT: vmovdqa %ymm4, (%rdx)
1476 ; AVX512DQ-NEXT: vmovdqa %ymm2, (%rcx)
1477 ; AVX512DQ-NEXT: vzeroupper
1478 ; AVX512DQ-NEXT: retq
1480 ; AVX512DQ-FCP-LABEL: load_i8_stride3_vf32:
1481 ; AVX512DQ-FCP: # %bb.0:
1482 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
1483 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
1484 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
1485 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
1486 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
1487 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
1488 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1489 ; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1]
1490 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1491 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
1492 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1493 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
1494 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1495 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
1496 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1497 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1498 ; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1]
1499 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm4
1500 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
1501 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
1502 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi)
1503 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rdx)
1504 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rcx)
1505 ; AVX512DQ-FCP-NEXT: vzeroupper
1506 ; AVX512DQ-FCP-NEXT: retq
1508 ; AVX512BW-LABEL: load_i8_stride3_vf32:
1509 ; AVX512BW: # %bb.0:
1510 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
1511 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
1512 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
1513 ; AVX512BW-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
1514 ; AVX512BW-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
1515 ; AVX512BW-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
1516 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1517 ; AVX512BW-NEXT: # ymm3 = mem[0,1,0,1]
1518 ; AVX512BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1519 ; AVX512BW-NEXT: vpshufb %ymm3, %ymm1, %ymm1
1520 ; AVX512BW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1521 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
1522 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1523 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
1524 ; AVX512BW-NEXT: movl $-134154240, %eax # imm = 0xF800F800
1525 ; AVX512BW-NEXT: kmovd %eax, %k1
1526 ; AVX512BW-NEXT: vpblendmb %ymm1, %ymm0, %ymm2 {%k1}
1527 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1528 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
1529 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
1530 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
1531 ; AVX512BW-NEXT: vmovdqa %ymm2, (%rdx)
1532 ; AVX512BW-NEXT: vmovdqa %ymm1, (%rcx)
1533 ; AVX512BW-NEXT: vzeroupper
1534 ; AVX512BW-NEXT: retq
1536 ; AVX512BW-FCP-LABEL: load_i8_stride3_vf32:
1537 ; AVX512BW-FCP: # %bb.0:
1538 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
1539 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
1540 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
1541 ; AVX512BW-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
1542 ; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
1543 ; AVX512BW-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
1544 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1545 ; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
1546 ; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1547 ; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
1548 ; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1549 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
1550 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1551 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
1552 ; AVX512BW-FCP-NEXT: movl $-134154240, %eax # imm = 0xF800F800
1553 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
1554 ; AVX512BW-FCP-NEXT: vpblendmb %ymm1, %ymm0, %ymm2 {%k1}
1555 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1556 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
1557 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
1558 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi)
1559 ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rdx)
1560 ; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rcx)
1561 ; AVX512BW-FCP-NEXT: vzeroupper
1562 ; AVX512BW-FCP-NEXT: retq
1564 ; AVX512DQ-BW-LABEL: load_i8_stride3_vf32:
1565 ; AVX512DQ-BW: # %bb.0:
1566 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
1567 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1
1568 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2
1569 ; AVX512DQ-BW-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
1570 ; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
1571 ; AVX512DQ-BW-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
1572 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1573 ; AVX512DQ-BW-NEXT: # ymm3 = mem[0,1,0,1]
1574 ; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1575 ; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm1, %ymm1
1576 ; AVX512DQ-BW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1577 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
1578 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1579 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
1580 ; AVX512DQ-BW-NEXT: movl $-134154240, %eax # imm = 0xF800F800
1581 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
1582 ; AVX512DQ-BW-NEXT: vpblendmb %ymm1, %ymm0, %ymm2 {%k1}
1583 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1584 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
1585 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
1586 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi)
1587 ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rdx)
1588 ; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%rcx)
1589 ; AVX512DQ-BW-NEXT: vzeroupper
1590 ; AVX512DQ-BW-NEXT: retq
1592 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride3_vf32:
1593 ; AVX512DQ-BW-FCP: # %bb.0:
1594 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
1595 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
1596 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
1597 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
1598 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
1599 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
1600 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
1601 ; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
1602 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
1603 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
1604 ; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
1605 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
1606 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1607 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
1608 ; AVX512DQ-BW-FCP-NEXT: movl $-134154240, %eax # imm = 0xF800F800
1609 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
1610 ; AVX512DQ-BW-FCP-NEXT: vpblendmb %ymm1, %ymm0, %ymm2 {%k1}
1611 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
1612 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
1613 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
1614 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi)
1615 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rdx)
1616 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rcx)
1617 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
1618 ; AVX512DQ-BW-FCP-NEXT: retq
1619 %wide.vec = load <96 x i8>, ptr %in.vec, align 64
1620 %strided.vec0 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison, <32 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93>
1621 %strided.vec1 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison, <32 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94>
1622 %strided.vec2 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison, <32 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62, i32 65, i32 68, i32 71, i32 74, i32 77, i32 80, i32 83, i32 86, i32 89, i32 92, i32 95>
1623 store <32 x i8> %strided.vec0, ptr %out.vec0, align 64
1624 store <32 x i8> %strided.vec1, ptr %out.vec1, align 64
1625 store <32 x i8> %strided.vec2, ptr %out.vec2, align 64
1629 define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind {
1630 ; SSE-LABEL: load_i8_stride3_vf64:
1632 ; SSE-NEXT: subq $168, %rsp
1633 ; SSE-NEXT: movdqa 80(%rdi), %xmm9
1634 ; SSE-NEXT: movdqa (%rdi), %xmm0
1635 ; SSE-NEXT: movdqa 16(%rdi), %xmm10
1636 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1637 ; SSE-NEXT: movdqa 48(%rdi), %xmm5
1638 ; SSE-NEXT: movdqa 64(%rdi), %xmm1
1639 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1640 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255]
1641 ; SSE-NEXT: movdqa %xmm2, %xmm6
1642 ; SSE-NEXT: pandn %xmm1, %xmm6
1643 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255]
1644 ; SSE-NEXT: movdqa %xmm4, %xmm1
1645 ; SSE-NEXT: pandn %xmm5, %xmm1
1646 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1647 ; SSE-NEXT: movdqa %xmm2, %xmm1
1648 ; SSE-NEXT: pandn %xmm5, %xmm1
1649 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1650 ; SSE-NEXT: pand %xmm2, %xmm5
1651 ; SSE-NEXT: movdqa %xmm2, %xmm11
1652 ; SSE-NEXT: por %xmm6, %xmm5
1653 ; SSE-NEXT: pxor %xmm8, %xmm8
1654 ; SSE-NEXT: movdqa %xmm5, %xmm1
1655 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
1656 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0]
1657 ; SSE-NEXT: movdqa %xmm6, %xmm2
1658 ; SSE-NEXT: pandn %xmm1, %xmm2
1659 ; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
1660 ; SSE-NEXT: pand %xmm6, %xmm5
1661 ; SSE-NEXT: por %xmm2, %xmm5
1662 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,1,3]
1663 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
1664 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
1665 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
1666 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
1667 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
1668 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
1669 ; SSE-NEXT: packuswb %xmm1, %xmm2
1670 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1671 ; SSE-NEXT: pand %xmm7, %xmm2
1672 ; SSE-NEXT: movdqa %xmm9, %xmm13
1673 ; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm8[8],xmm13[9],xmm8[9],xmm13[10],xmm8[10],xmm13[11],xmm8[11],xmm13[12],xmm8[12],xmm13[13],xmm8[13],xmm13[14],xmm8[14],xmm13[15],xmm8[15]
1674 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,0,65535,65535,0,65535,65535]
1675 ; SSE-NEXT: movdqa %xmm12, %xmm1
1676 ; SSE-NEXT: pandn %xmm13, %xmm1
1677 ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
1678 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1679 ; SSE-NEXT: pand %xmm12, %xmm9
1680 ; SSE-NEXT: por %xmm1, %xmm9
1681 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,0]
1682 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
1683 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,0]
1684 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
1685 ; SSE-NEXT: packuswb %xmm1, %xmm1
1686 ; SSE-NEXT: movdqa %xmm7, %xmm3
1687 ; SSE-NEXT: pandn %xmm1, %xmm3
1688 ; SSE-NEXT: por %xmm2, %xmm3
1689 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1690 ; SSE-NEXT: movdqa %xmm11, %xmm1
1691 ; SSE-NEXT: pandn %xmm10, %xmm1
1692 ; SSE-NEXT: movdqa %xmm4, %xmm2
1693 ; SSE-NEXT: pandn %xmm0, %xmm2
1694 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1695 ; SSE-NEXT: movdqa %xmm11, %xmm2
1696 ; SSE-NEXT: pandn %xmm0, %xmm2
1697 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1698 ; SSE-NEXT: pand %xmm11, %xmm0
1699 ; SSE-NEXT: por %xmm1, %xmm0
1700 ; SSE-NEXT: movdqa %xmm0, %xmm1
1701 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
1702 ; SSE-NEXT: movdqa %xmm6, %xmm2
1703 ; SSE-NEXT: pandn %xmm1, %xmm2
1704 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
1705 ; SSE-NEXT: pand %xmm6, %xmm0
1706 ; SSE-NEXT: por %xmm2, %xmm0
1707 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1708 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
1709 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
1710 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
1711 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
1712 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
1713 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
1714 ; SSE-NEXT: packuswb %xmm1, %xmm0
1715 ; SSE-NEXT: movdqa 32(%rdi), %xmm15
1716 ; SSE-NEXT: movdqa %xmm15, %xmm10
1717 ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
1718 ; SSE-NEXT: movdqa %xmm12, %xmm1
1719 ; SSE-NEXT: pandn %xmm10, %xmm1
1720 ; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
1721 ; SSE-NEXT: movdqa %xmm15, %xmm2
1722 ; SSE-NEXT: pand %xmm12, %xmm2
1723 ; SSE-NEXT: por %xmm1, %xmm2
1724 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,0]
1725 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
1726 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,0]
1727 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
1728 ; SSE-NEXT: packuswb %xmm1, %xmm1
1729 ; SSE-NEXT: movdqa %xmm7, %xmm2
1730 ; SSE-NEXT: pandn %xmm1, %xmm2
1731 ; SSE-NEXT: pand %xmm7, %xmm0
1732 ; SSE-NEXT: por %xmm0, %xmm2
1733 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1734 ; SSE-NEXT: movdqa 160(%rdi), %xmm14
1735 ; SSE-NEXT: movdqa %xmm11, %xmm0
1736 ; SSE-NEXT: pandn %xmm14, %xmm0
1737 ; SSE-NEXT: movdqa 144(%rdi), %xmm1
1738 ; SSE-NEXT: movdqa %xmm4, %xmm2
1739 ; SSE-NEXT: pandn %xmm1, %xmm2
1740 ; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
1741 ; SSE-NEXT: movdqa %xmm11, %xmm5
1742 ; SSE-NEXT: movdqa %xmm11, %xmm2
1743 ; SSE-NEXT: pandn %xmm1, %xmm2
1744 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1745 ; SSE-NEXT: pand %xmm11, %xmm1
1746 ; SSE-NEXT: por %xmm0, %xmm1
1747 ; SSE-NEXT: movdqa %xmm1, %xmm0
1748 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
1749 ; SSE-NEXT: movdqa %xmm6, %xmm2
1750 ; SSE-NEXT: pandn %xmm0, %xmm2
1751 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
1752 ; SSE-NEXT: pand %xmm6, %xmm1
1753 ; SSE-NEXT: por %xmm2, %xmm1
1754 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
1755 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
1756 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
1757 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
1758 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,6,5,4,7]
1759 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
1760 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
1761 ; SSE-NEXT: packuswb %xmm0, %xmm9
1762 ; SSE-NEXT: movdqa 176(%rdi), %xmm3
1763 ; SSE-NEXT: movdqa %xmm3, %xmm1
1764 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
1765 ; SSE-NEXT: movdqa %xmm12, %xmm0
1766 ; SSE-NEXT: pandn %xmm1, %xmm0
1767 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
1768 ; SSE-NEXT: movdqa %xmm3, %xmm11
1769 ; SSE-NEXT: pand %xmm12, %xmm11
1770 ; SSE-NEXT: por %xmm0, %xmm11
1771 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,1,2,0]
1772 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
1773 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0]
1774 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
1775 ; SSE-NEXT: packuswb %xmm0, %xmm0
1776 ; SSE-NEXT: movdqa %xmm7, %xmm2
1777 ; SSE-NEXT: pandn %xmm0, %xmm2
1778 ; SSE-NEXT: pand %xmm7, %xmm9
1779 ; SSE-NEXT: por %xmm9, %xmm2
1780 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1781 ; SSE-NEXT: movdqa 112(%rdi), %xmm9
1782 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1783 ; SSE-NEXT: movdqa %xmm5, %xmm0
1784 ; SSE-NEXT: pandn %xmm9, %xmm0
1785 ; SSE-NEXT: movdqa 96(%rdi), %xmm9
1786 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1787 ; SSE-NEXT: pand %xmm5, %xmm9
1788 ; SSE-NEXT: por %xmm0, %xmm9
1789 ; SSE-NEXT: movdqa %xmm9, %xmm0
1790 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
1791 ; SSE-NEXT: movdqa %xmm6, %xmm11
1792 ; SSE-NEXT: pandn %xmm0, %xmm11
1793 ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
1794 ; SSE-NEXT: pand %xmm6, %xmm9
1795 ; SSE-NEXT: por %xmm11, %xmm9
1796 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,1,3]
1797 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5]
1798 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,1]
1799 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,1,4,5,6,7]
1800 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7]
1801 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
1802 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
1803 ; SSE-NEXT: packuswb %xmm0, %xmm9
1804 ; SSE-NEXT: movdqa 128(%rdi), %xmm2
1805 ; SSE-NEXT: movdqa %xmm2, %xmm11
1806 ; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15]
1807 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1808 ; SSE-NEXT: movdqa %xmm12, %xmm0
1809 ; SSE-NEXT: pandn %xmm11, %xmm0
1810 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
1811 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1812 ; SSE-NEXT: movdqa %xmm2, %xmm11
1813 ; SSE-NEXT: pand %xmm12, %xmm11
1814 ; SSE-NEXT: por %xmm0, %xmm11
1815 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,1,2,0]
1816 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
1817 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0]
1818 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
1819 ; SSE-NEXT: packuswb %xmm0, %xmm0
1820 ; SSE-NEXT: movdqa %xmm7, %xmm2
1821 ; SSE-NEXT: pandn %xmm0, %xmm2
1822 ; SSE-NEXT: pand %xmm7, %xmm9
1823 ; SSE-NEXT: por %xmm9, %xmm2
1824 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1825 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1826 ; SSE-NEXT: pand %xmm4, %xmm0
1827 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1828 ; SSE-NEXT: movdqa %xmm0, %xmm9
1829 ; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15]
1830 ; SSE-NEXT: movdqa %xmm12, %xmm11
1831 ; SSE-NEXT: pandn %xmm9, %xmm11
1832 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
1833 ; SSE-NEXT: pand %xmm12, %xmm0
1834 ; SSE-NEXT: por %xmm11, %xmm0
1835 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
1836 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
1837 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
1838 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
1839 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
1840 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7]
1841 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
1842 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,2,3,3,4,5,6,7]
1843 ; SSE-NEXT: packuswb %xmm9, %xmm0
1844 ; SSE-NEXT: movdqa %xmm12, %xmm9
1845 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1846 ; SSE-NEXT: pandn %xmm5, %xmm9
1847 ; SSE-NEXT: movdqa %xmm6, %xmm2
1848 ; SSE-NEXT: pandn %xmm13, %xmm2
1849 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1850 ; SSE-NEXT: pand %xmm12, %xmm13
1851 ; SSE-NEXT: por %xmm9, %xmm13
1852 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm13[0,3,2,3,4,5,6,7]
1853 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7]
1854 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,0]
1855 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7]
1856 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4]
1857 ; SSE-NEXT: packuswb %xmm9, %xmm9
1858 ; SSE-NEXT: movdqa %xmm7, %xmm2
1859 ; SSE-NEXT: pandn %xmm9, %xmm2
1860 ; SSE-NEXT: pand %xmm7, %xmm0
1861 ; SSE-NEXT: por %xmm0, %xmm2
1862 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1863 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1864 ; SSE-NEXT: pand %xmm4, %xmm0
1865 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1866 ; SSE-NEXT: movdqa %xmm0, %xmm9
1867 ; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15]
1868 ; SSE-NEXT: movdqa %xmm12, %xmm11
1869 ; SSE-NEXT: pandn %xmm9, %xmm11
1870 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
1871 ; SSE-NEXT: pand %xmm12, %xmm0
1872 ; SSE-NEXT: por %xmm11, %xmm0
1873 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
1874 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
1875 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
1876 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
1877 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
1878 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7]
1879 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3]
1880 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,2,3,3,4,5,6,7]
1881 ; SSE-NEXT: packuswb %xmm9, %xmm0
1882 ; SSE-NEXT: movdqa %xmm12, %xmm11
1883 ; SSE-NEXT: pandn %xmm15, %xmm11
1884 ; SSE-NEXT: movdqa %xmm6, %xmm9
1885 ; SSE-NEXT: pandn %xmm10, %xmm9
1886 ; SSE-NEXT: pand %xmm12, %xmm10
1887 ; SSE-NEXT: por %xmm11, %xmm10
1888 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7]
1889 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,6,7]
1890 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0]
1891 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7]
1892 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4]
1893 ; SSE-NEXT: packuswb %xmm10, %xmm11
1894 ; SSE-NEXT: movdqa %xmm7, %xmm10
1895 ; SSE-NEXT: pandn %xmm11, %xmm10
1896 ; SSE-NEXT: pand %xmm7, %xmm0
1897 ; SSE-NEXT: por %xmm0, %xmm10
1898 ; SSE-NEXT: movdqa %xmm14, %xmm0
1899 ; SSE-NEXT: pand %xmm4, %xmm0
1900 ; SSE-NEXT: por (%rsp), %xmm0 # 16-byte Folded Reload
1901 ; SSE-NEXT: movdqa %xmm0, %xmm11
1902 ; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15]
1903 ; SSE-NEXT: movdqa %xmm12, %xmm13
1904 ; SSE-NEXT: pandn %xmm11, %xmm13
1905 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
1906 ; SSE-NEXT: pand %xmm12, %xmm0
1907 ; SSE-NEXT: por %xmm13, %xmm0
1908 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
1909 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
1910 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
1911 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
1912 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
1913 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7]
1914 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3]
1915 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,2,3,3,4,5,6,7]
1916 ; SSE-NEXT: packuswb %xmm11, %xmm0
1917 ; SSE-NEXT: movdqa %xmm12, %xmm13
1918 ; SSE-NEXT: pandn %xmm3, %xmm13
1919 ; SSE-NEXT: movdqa %xmm6, %xmm11
1920 ; SSE-NEXT: pandn %xmm1, %xmm11
1921 ; SSE-NEXT: pand %xmm12, %xmm1
1922 ; SSE-NEXT: por %xmm13, %xmm1
1923 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
1924 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
1925 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
1926 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7]
1927 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4]
1928 ; SSE-NEXT: packuswb %xmm1, %xmm1
1929 ; SSE-NEXT: movdqa %xmm7, %xmm13
1930 ; SSE-NEXT: pandn %xmm1, %xmm13
1931 ; SSE-NEXT: pand %xmm7, %xmm0
1932 ; SSE-NEXT: por %xmm0, %xmm13
1933 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1934 ; SSE-NEXT: pand %xmm4, %xmm0
1935 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
1936 ; SSE-NEXT: por %xmm0, %xmm4
1937 ; SSE-NEXT: movdqa %xmm4, %xmm0
1938 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
1939 ; SSE-NEXT: movdqa %xmm12, %xmm1
1940 ; SSE-NEXT: pandn %xmm0, %xmm1
1941 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
1942 ; SSE-NEXT: pand %xmm12, %xmm4
1943 ; SSE-NEXT: por %xmm1, %xmm4
1944 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,1,0,3,4,5,6,7]
1945 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
1946 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
1947 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
1948 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4]
1949 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
1950 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1951 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,3,4,5,6,7]
1952 ; SSE-NEXT: packuswb %xmm0, %xmm1
1953 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1954 ; SSE-NEXT: pand %xmm12, %xmm0
1955 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
1956 ; SSE-NEXT: por %xmm0, %xmm12
1957 ; SSE-NEXT: pand %xmm7, %xmm1
1958 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,3,2,3,4,5,6,7]
1959 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
1960 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
1961 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
1962 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
1963 ; SSE-NEXT: packuswb %xmm0, %xmm0
1964 ; SSE-NEXT: pandn %xmm0, %xmm7
1965 ; SSE-NEXT: por %xmm1, %xmm7
1966 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1967 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255]
1968 ; SSE-NEXT: pand %xmm2, %xmm0
1969 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1970 ; SSE-NEXT: movdqa %xmm0, %xmm4
1971 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
1972 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,0,65535,65535,0,65535]
1973 ; SSE-NEXT: movdqa %xmm12, %xmm1
1974 ; SSE-NEXT: pandn %xmm4, %xmm1
1975 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
1976 ; SSE-NEXT: pand %xmm12, %xmm0
1977 ; SSE-NEXT: por %xmm1, %xmm0
1978 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,0]
1979 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
1980 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
1981 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
1982 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,5,4,7]
1983 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,1,2,3]
1984 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
1985 ; SSE-NEXT: packuswb %xmm1, %xmm0
1986 ; SSE-NEXT: movdqa %xmm5, %xmm1
1987 ; SSE-NEXT: pand %xmm6, %xmm1
1988 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1989 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
1990 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
1991 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
1992 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
1993 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
1994 ; SSE-NEXT: packuswb %xmm1, %xmm1
1995 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0]
1996 ; SSE-NEXT: movdqa %xmm4, %xmm5
1997 ; SSE-NEXT: pandn %xmm1, %xmm5
1998 ; SSE-NEXT: pand %xmm4, %xmm0
1999 ; SSE-NEXT: por %xmm0, %xmm5
2000 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2001 ; SSE-NEXT: pand %xmm2, %xmm0
2002 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2003 ; SSE-NEXT: movdqa %xmm0, %xmm2
2004 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
2005 ; SSE-NEXT: movdqa %xmm12, %xmm1
2006 ; SSE-NEXT: pandn %xmm2, %xmm1
2007 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
2008 ; SSE-NEXT: pand %xmm12, %xmm0
2009 ; SSE-NEXT: por %xmm1, %xmm0
2010 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,0]
2011 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
2012 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
2013 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
2014 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
2015 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,1,2,3]
2016 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
2017 ; SSE-NEXT: packuswb %xmm0, %xmm1
2018 ; SSE-NEXT: pand %xmm6, %xmm15
2019 ; SSE-NEXT: por %xmm9, %xmm15
2020 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,2,1,3]
2021 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
2022 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2023 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
2024 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
2025 ; SSE-NEXT: packuswb %xmm0, %xmm0
2026 ; SSE-NEXT: movdqa %xmm4, %xmm9
2027 ; SSE-NEXT: pandn %xmm0, %xmm9
2028 ; SSE-NEXT: pand %xmm4, %xmm1
2029 ; SSE-NEXT: por %xmm1, %xmm9
2030 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255]
2031 ; SSE-NEXT: pand %xmm2, %xmm14
2032 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
2033 ; SSE-NEXT: movdqa %xmm14, %xmm0
2034 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
2035 ; SSE-NEXT: movdqa %xmm12, %xmm1
2036 ; SSE-NEXT: pandn %xmm0, %xmm1
2037 ; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7]
2038 ; SSE-NEXT: pand %xmm12, %xmm14
2039 ; SSE-NEXT: por %xmm1, %xmm14
2040 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,1,2,0]
2041 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
2042 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
2043 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
2044 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
2045 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
2046 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
2047 ; SSE-NEXT: packuswb %xmm0, %xmm1
2048 ; SSE-NEXT: pand %xmm6, %xmm3
2049 ; SSE-NEXT: por %xmm11, %xmm3
2050 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,1,3]
2051 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
2052 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2053 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
2054 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
2055 ; SSE-NEXT: packuswb %xmm0, %xmm3
2056 ; SSE-NEXT: movdqa %xmm4, %xmm0
2057 ; SSE-NEXT: pandn %xmm3, %xmm0
2058 ; SSE-NEXT: pand %xmm4, %xmm1
2059 ; SSE-NEXT: por %xmm1, %xmm0
2060 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2061 ; SSE-NEXT: pand %xmm2, %xmm1
2062 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2063 ; SSE-NEXT: por %xmm1, %xmm2
2064 ; SSE-NEXT: movdqa %xmm2, %xmm1
2065 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
2066 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
2067 ; SSE-NEXT: pand %xmm12, %xmm2
2068 ; SSE-NEXT: pandn %xmm1, %xmm12
2069 ; SSE-NEXT: por %xmm2, %xmm12
2070 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[3,1,2,0]
2071 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
2072 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0]
2073 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,0,3,4,5,6,7]
2074 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
2075 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
2076 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
2077 ; SSE-NEXT: packuswb %xmm1, %xmm3
2078 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2079 ; SSE-NEXT: pand %xmm6, %xmm1
2080 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
2081 ; SSE-NEXT: por %xmm1, %xmm6
2082 ; SSE-NEXT: pand %xmm4, %xmm3
2083 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,1,3]
2084 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
2085 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
2086 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
2087 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
2088 ; SSE-NEXT: packuswb %xmm1, %xmm1
2089 ; SSE-NEXT: pandn %xmm1, %xmm4
2090 ; SSE-NEXT: por %xmm3, %xmm4
2091 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2092 ; SSE-NEXT: movaps %xmm1, 32(%rsi)
2093 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2094 ; SSE-NEXT: movaps %xmm1, 48(%rsi)
2095 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2096 ; SSE-NEXT: movaps %xmm1, (%rsi)
2097 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2098 ; SSE-NEXT: movaps %xmm1, 16(%rsi)
2099 ; SSE-NEXT: movdqa %xmm7, 32(%rdx)
2100 ; SSE-NEXT: movdqa %xmm13, 48(%rdx)
2101 ; SSE-NEXT: movdqa %xmm10, (%rdx)
2102 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2103 ; SSE-NEXT: movaps %xmm1, 16(%rdx)
2104 ; SSE-NEXT: movdqa %xmm4, 32(%rcx)
2105 ; SSE-NEXT: movdqa %xmm0, 48(%rcx)
2106 ; SSE-NEXT: movdqa %xmm9, (%rcx)
2107 ; SSE-NEXT: movdqa %xmm5, 16(%rcx)
2108 ; SSE-NEXT: addq $168, %rsp
2111 ; AVX-LABEL: load_i8_stride3_vf64:
2113 ; AVX-NEXT: vmovdqa (%rdi), %xmm6
2114 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm0
2115 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2116 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm4
2117 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm7
2118 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm2
2119 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2120 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm11
2121 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm3
2122 ; AVX-NEXT: vmovdqa 144(%rdi), %xmm10
2123 ; AVX-NEXT: vmovdqa 160(%rdi), %xmm1
2124 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2125 ; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14]
2126 ; AVX-NEXT: vpshufb %xmm12, %xmm6, %xmm5
2127 ; AVX-NEXT: vpshufb %xmm12, %xmm10, %xmm8
2128 ; AVX-NEXT: vpshufb %xmm12, %xmm11, %xmm9
2129 ; AVX-NEXT: vpshufb %xmm12, %xmm7, %xmm13
2130 ; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u]
2131 ; AVX-NEXT: vpshufb %xmm14, %xmm6, %xmm6
2132 ; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u]
2133 ; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm12
2134 ; AVX-NEXT: vpor %xmm6, %xmm12, %xmm6
2135 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2136 ; AVX-NEXT: vpshufb %xmm14, %xmm10, %xmm10
2137 ; AVX-NEXT: vpshufb %xmm15, %xmm1, %xmm12
2138 ; AVX-NEXT: vpor %xmm10, %xmm12, %xmm6
2139 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2140 ; AVX-NEXT: vpshufb %xmm14, %xmm11, %xmm11
2141 ; AVX-NEXT: vpshufb %xmm15, %xmm3, %xmm12
2142 ; AVX-NEXT: vpor %xmm11, %xmm12, %xmm6
2143 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2144 ; AVX-NEXT: vpshufb %xmm14, %xmm7, %xmm7
2145 ; AVX-NEXT: vpshufb %xmm15, %xmm2, %xmm12
2146 ; AVX-NEXT: vpor %xmm7, %xmm12, %xmm6
2147 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2148 ; AVX-NEXT: vpshufb %xmm14, %xmm0, %xmm12
2149 ; AVX-NEXT: vpshufb %xmm15, %xmm4, %xmm6
2150 ; AVX-NEXT: vpor %xmm6, %xmm12, %xmm6
2151 ; AVX-NEXT: vmovdqa 176(%rdi), %xmm12
2152 ; AVX-NEXT: vpshufb %xmm14, %xmm1, %xmm0
2153 ; AVX-NEXT: vpshufb %xmm15, %xmm12, %xmm10
2154 ; AVX-NEXT: vpor %xmm0, %xmm10, %xmm1
2155 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm10
2156 ; AVX-NEXT: vpshufb %xmm14, %xmm3, %xmm0
2157 ; AVX-NEXT: vpshufb %xmm15, %xmm10, %xmm11
2158 ; AVX-NEXT: vpor %xmm0, %xmm11, %xmm0
2159 ; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm11
2160 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm14
2161 ; AVX-NEXT: vpshufb %xmm15, %xmm14, %xmm15
2162 ; AVX-NEXT: vpor %xmm11, %xmm15, %xmm11
2163 ; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128]
2164 ; AVX-NEXT: vpshufb %xmm15, %xmm14, %xmm7
2165 ; AVX-NEXT: vpor %xmm7, %xmm13, %xmm7
2166 ; AVX-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7,8,9,10]
2167 ; AVX-NEXT: vpshufb %xmm15, %xmm10, %xmm11
2168 ; AVX-NEXT: vpor %xmm11, %xmm9, %xmm11
2169 ; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
2170 ; AVX-NEXT: vpshufb %xmm15, %xmm12, %xmm0
2171 ; AVX-NEXT: vpor %xmm0, %xmm8, %xmm2
2172 ; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
2173 ; AVX-NEXT: vpshufb %xmm15, %xmm4, %xmm1
2174 ; AVX-NEXT: vpor %xmm1, %xmm5, %xmm1
2175 ; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
2176 ; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
2177 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2178 ; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm15
2179 ; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14]
2180 ; AVX-NEXT: vpshufb %xmm0, %xmm14, %xmm14
2181 ; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14
2182 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
2183 ; AVX-NEXT: vpshufb %xmm6, %xmm15, %xmm15
2184 ; AVX-NEXT: vpshufb %xmm0, %xmm10, %xmm10
2185 ; AVX-NEXT: vpor %xmm10, %xmm15, %xmm10
2186 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
2187 ; AVX-NEXT: vpshufb %xmm6, %xmm15, %xmm15
2188 ; AVX-NEXT: vpshufb %xmm0, %xmm12, %xmm12
2189 ; AVX-NEXT: vpor %xmm12, %xmm15, %xmm12
2190 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
2191 ; AVX-NEXT: vpshufb %xmm6, %xmm15, %xmm6
2192 ; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm0
2193 ; AVX-NEXT: vpor %xmm0, %xmm6, %xmm0
2194 ; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
2195 ; AVX-NEXT: vpshufb %xmm4, %xmm7, %xmm6
2196 ; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
2197 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
2198 ; AVX-NEXT: vpshufb %xmm7, %xmm15, %xmm15
2199 ; AVX-NEXT: vpor %xmm6, %xmm15, %xmm6
2200 ; AVX-NEXT: vpshufb %xmm4, %xmm11, %xmm11
2201 ; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm15
2202 ; AVX-NEXT: vpor %xmm15, %xmm11, %xmm11
2203 ; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2
2204 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2205 ; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm3
2206 ; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
2207 ; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1
2208 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2209 ; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm3
2210 ; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1
2211 ; AVX-NEXT: vmovdqa %xmm1, (%rsi)
2212 ; AVX-NEXT: vmovdqa %xmm2, 48(%rsi)
2213 ; AVX-NEXT: vmovdqa %xmm11, 32(%rsi)
2214 ; AVX-NEXT: vmovdqa %xmm6, 16(%rsi)
2215 ; AVX-NEXT: vmovdqa %xmm0, (%rdx)
2216 ; AVX-NEXT: vmovdqa %xmm12, 48(%rdx)
2217 ; AVX-NEXT: vmovdqa %xmm10, 32(%rdx)
2218 ; AVX-NEXT: vmovdqa %xmm14, 16(%rdx)
2219 ; AVX-NEXT: vmovdqa %xmm5, (%rcx)
2220 ; AVX-NEXT: vmovdqa %xmm8, 48(%rcx)
2221 ; AVX-NEXT: vmovdqa %xmm9, 32(%rcx)
2222 ; AVX-NEXT: vmovdqa %xmm13, 16(%rcx)
2225 ; AVX2-LABEL: load_i8_stride3_vf64:
2227 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
2228 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
2229 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
2230 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm3
2231 ; AVX2-NEXT: vmovdqa 112(%rdi), %xmm4
2232 ; AVX2-NEXT: vmovdqa 128(%rdi), %xmm5
2233 ; AVX2-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm6
2234 ; AVX2-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm0
2235 ; AVX2-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm1
2236 ; AVX2-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm2
2237 ; AVX2-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3
2238 ; AVX2-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm4
2239 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255]
2240 ; AVX2-NEXT: # ymm5 = mem[0,1,0,1]
2241 ; AVX2-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm7
2242 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14]
2243 ; AVX2-NEXT: # ymm8 = mem[0,1,0,1]
2244 ; AVX2-NEXT: vpshufb %ymm8, %ymm7, %ymm7
2245 ; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm5
2246 ; AVX2-NEXT: vpshufb %ymm8, %ymm5, %ymm5
2247 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,0,0,255,0,0,255,0,0,255,0,0,255,0,0,255,255,0,0,255,0,0,255,0,0,255,0,0,255,0,0,255]
2248 ; AVX2-NEXT: # ymm8 = mem[0,1,0,1]
2249 ; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm6
2250 ; AVX2-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
2251 ; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm9
2252 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0,1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0]
2253 ; AVX2-NEXT: # ymm10 = mem[0,1,0,1]
2254 ; AVX2-NEXT: vpshufb %ymm10, %ymm9, %ymm9
2255 ; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm8
2256 ; AVX2-NEXT: vpshufb %ymm10, %ymm8, %ymm8
2257 ; AVX2-NEXT: vpalignr {{.*#+}} ymm8 = ymm5[11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26]
2258 ; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = ymm7[11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7,8,9,10],ymm7[27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23,24,25,26]
2259 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,128,128,128,128,128,17,20,23,26,29,16,19,22,25,28,31,128,128,128,128,128]
2260 ; AVX2-NEXT: vpshufb %ymm10, %ymm2, %ymm2
2261 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128,128,128,128,128,128,128,18,21,24,27,30]
2262 ; AVX2-NEXT: vpshufb %ymm11, %ymm4, %ymm4
2263 ; AVX2-NEXT: vpor %ymm4, %ymm2, %ymm2
2264 ; AVX2-NEXT: vpshufb %ymm10, %ymm6, %ymm4
2265 ; AVX2-NEXT: vpshufb %ymm11, %ymm1, %ymm1
2266 ; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm1
2267 ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
2268 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5]
2269 ; AVX2-NEXT: vpshufb %ymm5, %ymm3, %ymm3
2270 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255]
2271 ; AVX2-NEXT: # ymm6 = mem[0,1,0,1]
2272 ; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
2273 ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
2274 ; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0
2275 ; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0
2276 ; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
2277 ; AVX2-NEXT: vmovdqa %ymm3, 32(%rsi)
2278 ; AVX2-NEXT: vmovdqa %ymm1, (%rdx)
2279 ; AVX2-NEXT: vmovdqa %ymm2, 32(%rdx)
2280 ; AVX2-NEXT: vmovdqa %ymm9, (%rcx)
2281 ; AVX2-NEXT: vmovdqa %ymm8, 32(%rcx)
2282 ; AVX2-NEXT: vzeroupper
2285 ; AVX2-FP-LABEL: load_i8_stride3_vf64:
2287 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
2288 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
2289 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2
2290 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm3
2291 ; AVX2-FP-NEXT: vmovdqa 112(%rdi), %xmm4
2292 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm5
2293 ; AVX2-FP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm6
2294 ; AVX2-FP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm0
2295 ; AVX2-FP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm1
2296 ; AVX2-FP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm2
2297 ; AVX2-FP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3
2298 ; AVX2-FP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm4
2299 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255]
2300 ; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1]
2301 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm7
2302 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14]
2303 ; AVX2-FP-NEXT: # ymm8 = mem[0,1,0,1]
2304 ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm7, %ymm7
2305 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm5
2306 ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm5, %ymm5
2307 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,0,0,255,0,0,255,0,0,255,0,0,255,0,0,255,255,0,0,255,0,0,255,0,0,255,0,0,255,0,0,255]
2308 ; AVX2-FP-NEXT: # ymm8 = mem[0,1,0,1]
2309 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm6
2310 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
2311 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm9
2312 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0,1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0]
2313 ; AVX2-FP-NEXT: # ymm10 = mem[0,1,0,1]
2314 ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm9, %ymm9
2315 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm8
2316 ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm8, %ymm8
2317 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm8 = ymm5[11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26]
2318 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm9 = ymm7[11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7,8,9,10],ymm7[27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23,24,25,26]
2319 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,128,128,128,128,128,17,20,23,26,29,16,19,22,25,28,31,128,128,128,128,128]
2320 ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm2, %ymm2
2321 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128,128,128,128,128,128,128,18,21,24,27,30]
2322 ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
2323 ; AVX2-FP-NEXT: vpor %ymm4, %ymm2, %ymm2
2324 ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm6, %ymm4
2325 ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm1, %ymm1
2326 ; AVX2-FP-NEXT: vpor %ymm1, %ymm4, %ymm1
2327 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm4 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
2328 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5]
2329 ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm3, %ymm3
2330 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255]
2331 ; AVX2-FP-NEXT: # ymm6 = mem[0,1,0,1]
2332 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
2333 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
2334 ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
2335 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0
2336 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rsi)
2337 ; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%rsi)
2338 ; AVX2-FP-NEXT: vmovdqa %ymm1, (%rdx)
2339 ; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%rdx)
2340 ; AVX2-FP-NEXT: vmovdqa %ymm9, (%rcx)
2341 ; AVX2-FP-NEXT: vmovdqa %ymm8, 32(%rcx)
2342 ; AVX2-FP-NEXT: vzeroupper
2343 ; AVX2-FP-NEXT: retq
2345 ; AVX2-FCP-LABEL: load_i8_stride3_vf64:
2346 ; AVX2-FCP: # %bb.0:
2347 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
2348 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
2349 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
2350 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm3
2351 ; AVX2-FCP-NEXT: vmovdqa 112(%rdi), %xmm4
2352 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm5
2353 ; AVX2-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm6
2354 ; AVX2-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm0
2355 ; AVX2-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm1
2356 ; AVX2-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm2
2357 ; AVX2-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3
2358 ; AVX2-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm4
2359 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255]
2360 ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1]
2361 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm7
2362 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14]
2363 ; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1]
2364 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm7
2365 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm5
2366 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm5
2367 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,0,0,255,0,0,255,0,0,255,0,0,255,0,0,255,255,0,0,255,0,0,255,0,0,255,0,0,255,0,0,255]
2368 ; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1]
2369 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm6
2370 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2
2371 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm9
2372 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0,1,4,7,10,13,0,3,6,9,12,15,0,0,0,0,0]
2373 ; AVX2-FCP-NEXT: # ymm10 = mem[0,1,0,1]
2374 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9
2375 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm8
2376 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8
2377 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm5[11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23,24,25,26]
2378 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm7[11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7,8,9,10],ymm7[27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23,24,25,26]
2379 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [1,4,7,10,13,0,3,6,9,12,15,128,128,128,128,128,17,20,23,26,29,16,19,22,25,28,31,128,128,128,128,128]
2380 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2
2381 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128,128,128,128,128,128,128,18,21,24,27,30]
2382 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
2383 ; AVX2-FCP-NEXT: vpor %ymm4, %ymm2, %ymm2
2384 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm4
2385 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1
2386 ; AVX2-FCP-NEXT: vpor %ymm1, %ymm4, %ymm1
2387 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
2388 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5,8,11,14,0,0,0,2,5]
2389 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3
2390 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255]
2391 ; AVX2-FCP-NEXT: # ymm6 = mem[0,1,0,1]
2392 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
2393 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
2394 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
2395 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0
2396 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rsi)
2397 ; AVX2-FCP-NEXT: vmovdqa %ymm3, 32(%rsi)
2398 ; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rdx)
2399 ; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rdx)
2400 ; AVX2-FCP-NEXT: vmovdqa %ymm9, (%rcx)
2401 ; AVX2-FCP-NEXT: vmovdqa %ymm8, 32(%rcx)
2402 ; AVX2-FCP-NEXT: vzeroupper
2403 ; AVX2-FCP-NEXT: retq
2405 ; AVX512-LABEL: load_i8_stride3_vf64:
2407 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
2408 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
2409 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
2410 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm3
2411 ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm4
2412 ; AVX512-NEXT: vmovdqa 128(%rdi), %xmm5
2413 ; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
2414 ; AVX512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
2415 ; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
2416 ; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
2417 ; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
2418 ; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
2419 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
2420 ; AVX512-NEXT: # ymm6 = mem[0,1,0,1]
2421 ; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0
2422 ; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm3
2423 ; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1
2424 ; AVX512-NEXT: vpshufb %ymm6, %ymm4, %ymm4
2425 ; AVX512-NEXT: vpshufb %ymm6, %ymm2, %ymm2
2426 ; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm5
2427 ; AVX512-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
2428 ; AVX512-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
2429 ; AVX512-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26]
2430 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
2431 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm8
2432 ; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10],ymm4[27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26]
2433 ; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
2434 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm2
2435 ; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm7[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
2436 ; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = ymm6[11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10],ymm6[27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26]
2437 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
2438 ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2439 ; AVX512-NEXT: vpternlogq $202, %zmm2, %zmm8, %zmm5
2440 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26]
2441 ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
2442 ; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26]
2443 ; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
2444 ; AVX512-NEXT: vmovdqa %ymm2, 32(%rsi)
2445 ; AVX512-NEXT: vmovdqa %ymm0, (%rsi)
2446 ; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx)
2447 ; AVX512-NEXT: vmovdqa %ymm4, 32(%rcx)
2448 ; AVX512-NEXT: vmovdqa %ymm1, (%rcx)
2449 ; AVX512-NEXT: vzeroupper
2452 ; AVX512-FCP-LABEL: load_i8_stride3_vf64:
2453 ; AVX512-FCP: # %bb.0:
2454 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
2455 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
2456 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
2457 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm3
2458 ; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm4
2459 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm5
2460 ; AVX512-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
2461 ; AVX512-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
2462 ; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
2463 ; AVX512-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
2464 ; AVX512-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
2465 ; AVX512-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
2466 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
2467 ; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1]
2468 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
2469 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
2470 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
2471 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4
2472 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
2473 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
2474 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
2475 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
2476 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26]
2477 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
2478 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm8
2479 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10],ymm4[27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26]
2480 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
2481 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm2
2482 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm7[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
2483 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm6[11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10],ymm6[27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26]
2484 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
2485 ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2486 ; AVX512-FCP-NEXT: vpternlogq $202, %zmm2, %zmm8, %zmm5
2487 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26]
2488 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
2489 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26]
2490 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
2491 ; AVX512-FCP-NEXT: vmovdqa %ymm2, 32(%rsi)
2492 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi)
2493 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
2494 ; AVX512-FCP-NEXT: vmovdqa %ymm4, 32(%rcx)
2495 ; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rcx)
2496 ; AVX512-FCP-NEXT: vzeroupper
2497 ; AVX512-FCP-NEXT: retq
2499 ; AVX512DQ-LABEL: load_i8_stride3_vf64:
2500 ; AVX512DQ: # %bb.0:
2501 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
2502 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
2503 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2
2504 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm3
2505 ; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm4
2506 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm5
2507 ; AVX512DQ-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
2508 ; AVX512DQ-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
2509 ; AVX512DQ-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
2510 ; AVX512DQ-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
2511 ; AVX512DQ-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
2512 ; AVX512DQ-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
2513 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
2514 ; AVX512DQ-NEXT: # ymm6 = mem[0,1,0,1]
2515 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0
2516 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm3, %ymm3
2517 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm1, %ymm1
2518 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm4, %ymm4
2519 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm2, %ymm2
2520 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm5
2521 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
2522 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
2523 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26]
2524 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
2525 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm8
2526 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10],ymm4[27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26]
2527 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
2528 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm2
2529 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm7[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
2530 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = ymm6[11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10],ymm6[27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26]
2531 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
2532 ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2533 ; AVX512DQ-NEXT: vpternlogq $202, %zmm2, %zmm8, %zmm5
2534 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26]
2535 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
2536 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26]
2537 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
2538 ; AVX512DQ-NEXT: vmovdqa %ymm2, 32(%rsi)
2539 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rsi)
2540 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rdx)
2541 ; AVX512DQ-NEXT: vmovdqa %ymm4, 32(%rcx)
2542 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
2543 ; AVX512DQ-NEXT: vzeroupper
2544 ; AVX512DQ-NEXT: retq
2546 ; AVX512DQ-FCP-LABEL: load_i8_stride3_vf64:
2547 ; AVX512DQ-FCP: # %bb.0:
2548 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
2549 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
2550 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
2551 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm3
2552 ; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm4
2553 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm5
2554 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
2555 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
2556 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
2557 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
2558 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm5
2559 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
2560 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
2561 ; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1]
2562 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
2563 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
2564 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
2565 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4
2566 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
2567 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
2568 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm5[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm5[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
2569 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
2570 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26]
2571 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
2572 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm8
2573 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10],ymm4[27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26]
2574 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
2575 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm2
2576 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm7[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
2577 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm6[11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10],ymm6[27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26]
2578 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
2579 ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2580 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %zmm2, %zmm8, %zmm5
2581 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26]
2582 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
2583 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26]
2584 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
2585 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, 32(%rsi)
2586 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi)
2587 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
2588 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, 32(%rcx)
2589 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rcx)
2590 ; AVX512DQ-FCP-NEXT: vzeroupper
2591 ; AVX512DQ-FCP-NEXT: retq
2593 ; AVX512BW-LABEL: load_i8_stride3_vf64:
2594 ; AVX512BW: # %bb.0:
2595 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
2596 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
2597 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
2598 ; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm3
2599 ; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm4
2600 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm5
2601 ; AVX512BW-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
2602 ; AVX512BW-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
2603 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
2604 ; AVX512BW-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3
2605 ; AVX512BW-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
2606 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
2607 ; AVX512BW-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3
2608 ; AVX512BW-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
2609 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
2610 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
2611 ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2612 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0
2613 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1
2614 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm2, %zmm2
2615 ; AVX512BW-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
2616 ; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
2617 ; AVX512BW-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
2618 ; AVX512BW-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
2619 ; AVX512BW-NEXT: kmovq %rax, %k1
2620 ; AVX512BW-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
2621 ; AVX512BW-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
2622 ; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
2623 ; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
2624 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rsi)
2625 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rdx)
2626 ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rcx)
2627 ; AVX512BW-NEXT: vzeroupper
2628 ; AVX512BW-NEXT: retq
2630 ; AVX512BW-FCP-LABEL: load_i8_stride3_vf64:
2631 ; AVX512BW-FCP: # %bb.0:
2632 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
2633 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
2634 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
2635 ; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm3
2636 ; AVX512BW-FCP-NEXT: vmovdqa 112(%rdi), %xmm4
2637 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm5
2638 ; AVX512BW-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
2639 ; AVX512BW-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
2640 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
2641 ; AVX512BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3
2642 ; AVX512BW-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
2643 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
2644 ; AVX512BW-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3
2645 ; AVX512BW-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
2646 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
2647 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
2648 ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2649 ; AVX512BW-FCP-NEXT: vpshufb %zmm3, %zmm0, %zmm0
2650 ; AVX512BW-FCP-NEXT: vpshufb %zmm3, %zmm1, %zmm1
2651 ; AVX512BW-FCP-NEXT: vpshufb %zmm3, %zmm2, %zmm2
2652 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
2653 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
2654 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
2655 ; AVX512BW-FCP-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
2656 ; AVX512BW-FCP-NEXT: kmovq %rax, %k1
2657 ; AVX512BW-FCP-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
2658 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
2659 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
2660 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
2661 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rsi)
2662 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx)
2663 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rcx)
2664 ; AVX512BW-FCP-NEXT: vzeroupper
2665 ; AVX512BW-FCP-NEXT: retq
2667 ; AVX512DQ-BW-LABEL: load_i8_stride3_vf64:
2668 ; AVX512DQ-BW: # %bb.0:
2669 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
2670 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1
2671 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2
2672 ; AVX512DQ-BW-NEXT: vmovdqa 96(%rdi), %xmm3
2673 ; AVX512DQ-BW-NEXT: vmovdqa 112(%rdi), %xmm4
2674 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm5
2675 ; AVX512DQ-BW-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
2676 ; AVX512DQ-BW-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
2677 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
2678 ; AVX512DQ-BW-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3
2679 ; AVX512DQ-BW-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
2680 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
2681 ; AVX512DQ-BW-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3
2682 ; AVX512DQ-BW-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
2683 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
2684 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
2685 ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2686 ; AVX512DQ-BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0
2687 ; AVX512DQ-BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1
2688 ; AVX512DQ-BW-NEXT: vpshufb %zmm3, %zmm2, %zmm2
2689 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
2690 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
2691 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
2692 ; AVX512DQ-BW-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
2693 ; AVX512DQ-BW-NEXT: kmovq %rax, %k1
2694 ; AVX512DQ-BW-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
2695 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
2696 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
2697 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
2698 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rsi)
2699 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rdx)
2700 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rcx)
2701 ; AVX512DQ-BW-NEXT: vzeroupper
2702 ; AVX512DQ-BW-NEXT: retq
2704 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride3_vf64:
2705 ; AVX512DQ-BW-FCP: # %bb.0:
2706 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
2707 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
2708 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
2709 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm3
2710 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 112(%rdi), %xmm4
2711 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm5
2712 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
2713 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
2714 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
2715 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3
2716 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
2717 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
2718 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3
2719 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
2720 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
2721 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
2722 ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2723 ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm3, %zmm0, %zmm0
2724 ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm3, %zmm1, %zmm1
2725 ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm3, %zmm2, %zmm2
2726 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} zmm3 = zmm2[11,12,13,14,15],zmm0[0,1,2,3,4,5,6,7,8,9,10],zmm2[27,28,29,30,31],zmm0[16,17,18,19,20,21,22,23,24,25,26],zmm2[43,44,45,46,47],zmm0[32,33,34,35,36,37,38,39,40,41,42],zmm2[59,60,61,62,63],zmm0[48,49,50,51,52,53,54,55,56,57,58]
2727 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
2728 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} zmm1 = zmm1[11,12,13,14,15],zmm2[0,1,2,3,4,5,6,7,8,9,10],zmm1[27,28,29,30,31],zmm2[16,17,18,19,20,21,22,23,24,25,26],zmm1[43,44,45,46,47],zmm2[32,33,34,35,36,37,38,39,40,41,42],zmm1[59,60,61,62,63],zmm2[48,49,50,51,52,53,54,55,56,57,58]
2729 ; AVX512DQ-BW-FCP-NEXT: movabsq $-576188069258921984, %rax # imm = 0xF800F800F800F800
2730 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1
2731 ; AVX512DQ-BW-FCP-NEXT: vpblendmb %zmm1, %zmm0, %zmm2 {%k1}
2732 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} zmm1 = zmm3[11,12,13,14,15],zmm1[0,1,2,3,4,5,6,7,8,9,10],zmm3[27,28,29,30,31],zmm1[16,17,18,19,20,21,22,23,24,25,26],zmm3[43,44,45,46,47],zmm1[32,33,34,35,36,37,38,39,40,41,42],zmm3[59,60,61,62,63],zmm1[48,49,50,51,52,53,54,55,56,57,58]
2733 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[11,12,13,14,15],zmm3[0,1,2,3,4,5,6,7,8,9,10],zmm0[27,28,29,30,31],zmm3[16,17,18,19,20,21,22,23,24,25,26],zmm0[43,44,45,46,47],zmm3[32,33,34,35,36,37,38,39,40,41,42],zmm0[59,60,61,62,63],zmm3[48,49,50,51,52,53,54,55,56,57,58]
2734 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} zmm0 = zmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,42,43,44,45,46,47,32,33,34,35,36,37,38,39,40,41,58,59,60,61,62,63,48,49,50,51,52,53,54,55,56,57]
2735 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rsi)
2736 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx)
2737 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rcx)
2738 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
2739 ; AVX512DQ-BW-FCP-NEXT: retq
2740 %wide.vec = load <192 x i8>, ptr %in.vec, align 64
2741 %strided.vec0 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <64 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93, i32 96, i32 99, i32 102, i32 105, i32 108, i32 111, i32 114, i32 117, i32 120, i32 123, i32 126, i32 129, i32 132, i32 135, i32 138, i32 141, i32 144, i32 147, i32 150, i32 153, i32 156, i32 159, i32 162, i32 165, i32 168, i32 171, i32 174, i32 177, i32 180, i32 183, i32 186, i32 189>
2742 %strided.vec1 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <64 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94, i32 97, i32 100, i32 103, i32 106, i32 109, i32 112, i32 115, i32 118, i32 121, i32 124, i32 127, i32 130, i32 133, i32 136, i32 139, i32 142, i32 145, i32 148, i32 151, i32 154, i32 157, i32 160, i32 163, i32 166, i32 169, i32 172, i32 175, i32 178, i32 181, i32 184, i32 187, i32 190>
2743 %strided.vec2 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <64 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62, i32 65, i32 68, i32 71, i32 74, i32 77, i32 80, i32 83, i32 86, i32 89, i32 92, i32 95, i32 98, i32 101, i32 104, i32 107, i32 110, i32 113, i32 116, i32 119, i32 122, i32 125, i32 128, i32 131, i32 134, i32 137, i32 140, i32 143, i32 146, i32 149, i32 152, i32 155, i32 158, i32 161, i32 164, i32 167, i32 170, i32 173, i32 176, i32 179, i32 182, i32 185, i32 188, i32 191>
2744 store <64 x i8> %strided.vec0, ptr %out.vec0, align 64
2745 store <64 x i8> %strided.vec1, ptr %out.vec1, align 64
2746 store <64 x i8> %strided.vec2, ptr %out.vec2, align 64