1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
16 ; These patterns are produced by LoopVectorizer for interleaved loads.
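;
; Each function below loads one wide <5*VF x i8> vector, deinterleaves it into
; five stride-5 subvectors with shufflevector, and stores each result through
; its own output pointer (%out.vec0 .. %out.vec4).
;
; Illustrative sketch only (not generated by the script; the struct, function,
; and array names are hypothetical): a scalar loop over a 5-byte element such as
;
;   #include <stdint.h>
;   struct S { uint8_t a, b, c, d, e; };
;   void split(struct S *p, uint8_t *a, uint8_t *b, uint8_t *c,
;              uint8_t *d, uint8_t *e, int n) {
;     for (int i = 0; i != n; ++i) {
;       a[i] = p[i].a; b[i] = p[i].b; c[i] = p[i].c;
;       d[i] = p[i].d; e[i] = p[i].e;
;     }
;   }
;
; is the kind of interleaved access that LoopVectorizer turns into the
; wide-load-plus-shufflevector IR exercised below.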
18 define void @load_i8_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
19 ; SSE-LABEL: load_i8_stride5_vf2:
; SSE: # %bb.0:
21 ; SSE-NEXT: movdqa (%rdi), %xmm1
22 ; SSE-NEXT: pxor %xmm2, %xmm2
23 ; SSE-NEXT: movdqa %xmm1, %xmm0
24 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
25 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
26 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
27 ; SSE-NEXT: packuswb %xmm3, %xmm3
28 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,3,2,3]
29 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
30 ; SSE-NEXT: packuswb %xmm4, %xmm4
31 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,1,2,3]
32 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
33 ; SSE-NEXT: packuswb %xmm5, %xmm5
34 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
35 ; SSE-NEXT: psrlq $48, %xmm0
36 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
37 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
38 ; SSE-NEXT: packuswb %xmm0, %xmm0
39 ; SSE-NEXT: psrld $16, %xmm1
40 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
41 ; SSE-NEXT: packuswb %xmm6, %xmm6
42 ; SSE-NEXT: movd %xmm3, %eax
43 ; SSE-NEXT: movw %ax, (%rsi)
44 ; SSE-NEXT: movd %xmm4, %eax
45 ; SSE-NEXT: movw %ax, (%rdx)
46 ; SSE-NEXT: movd %xmm5, %eax
47 ; SSE-NEXT: movw %ax, (%rcx)
48 ; SSE-NEXT: movd %xmm0, %eax
49 ; SSE-NEXT: movw %ax, (%r8)
50 ; SSE-NEXT: movd %xmm6, %eax
51 ; SSE-NEXT: movw %ax, (%r9)
; SSE-NEXT: retq
54 ; AVX-LABEL: load_i8_stride5_vf2:
; AVX: # %bb.0:
56 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
57 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
58 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
59 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
60 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
61 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
62 ; AVX-NEXT: vpextrw $0, %xmm1, (%rsi)
63 ; AVX-NEXT: vpextrw $0, %xmm2, (%rdx)
64 ; AVX-NEXT: vpextrw $0, %xmm3, (%rcx)
65 ; AVX-NEXT: vpextrw $0, %xmm4, (%r8)
66 ; AVX-NEXT: vpextrw $0, %xmm0, (%r9)
; AVX-NEXT: retq
69 ; AVX2-LABEL: load_i8_stride5_vf2:
; AVX2: # %bb.0:
71 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
72 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
73 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
74 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
75 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
76 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
77 ; AVX2-NEXT: vpextrw $0, %xmm1, (%rsi)
78 ; AVX2-NEXT: vpextrw $0, %xmm2, (%rdx)
79 ; AVX2-NEXT: vpextrw $0, %xmm3, (%rcx)
80 ; AVX2-NEXT: vpextrw $0, %xmm4, (%r8)
81 ; AVX2-NEXT: vpextrw $0, %xmm0, (%r9)
; AVX2-NEXT: retq
84 ; AVX2-FP-LABEL: load_i8_stride5_vf2:
; AVX2-FP: # %bb.0:
86 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
87 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
88 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
89 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
90 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
91 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
92 ; AVX2-FP-NEXT: vpextrw $0, %xmm1, (%rsi)
93 ; AVX2-FP-NEXT: vpextrw $0, %xmm2, (%rdx)
94 ; AVX2-FP-NEXT: vpextrw $0, %xmm3, (%rcx)
95 ; AVX2-FP-NEXT: vpextrw $0, %xmm4, (%r8)
96 ; AVX2-FP-NEXT: vpextrw $0, %xmm0, (%r9)
; AVX2-FP-NEXT: retq
99 ; AVX2-FCP-LABEL: load_i8_stride5_vf2:
; AVX2-FCP: # %bb.0:
101 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
102 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
103 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
104 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
105 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
106 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
107 ; AVX2-FCP-NEXT: vpextrw $0, %xmm1, (%rsi)
108 ; AVX2-FCP-NEXT: vpextrw $0, %xmm2, (%rdx)
109 ; AVX2-FCP-NEXT: vpextrw $0, %xmm3, (%rcx)
110 ; AVX2-FCP-NEXT: vpextrw $0, %xmm4, (%r8)
111 ; AVX2-FCP-NEXT: vpextrw $0, %xmm0, (%r9)
112 ; AVX2-FCP-NEXT: retq
114 ; AVX512-LABEL: load_i8_stride5_vf2:
; AVX512: # %bb.0:
116 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
117 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
118 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
119 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
120 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
121 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
122 ; AVX512-NEXT: vpextrw $0, %xmm1, (%rsi)
123 ; AVX512-NEXT: vpextrw $0, %xmm2, (%rdx)
124 ; AVX512-NEXT: vpextrw $0, %xmm3, (%rcx)
125 ; AVX512-NEXT: vpextrw $0, %xmm4, (%r8)
126 ; AVX512-NEXT: vpextrw $0, %xmm0, (%r9)
; AVX512-NEXT: retq
129 ; AVX512-FCP-LABEL: load_i8_stride5_vf2:
130 ; AVX512-FCP: # %bb.0:
131 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
132 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
133 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
134 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
135 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
136 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
137 ; AVX512-FCP-NEXT: vpextrw $0, %xmm1, (%rsi)
138 ; AVX512-FCP-NEXT: vpextrw $0, %xmm2, (%rdx)
139 ; AVX512-FCP-NEXT: vpextrw $0, %xmm3, (%rcx)
140 ; AVX512-FCP-NEXT: vpextrw $0, %xmm4, (%r8)
141 ; AVX512-FCP-NEXT: vpextrw $0, %xmm0, (%r9)
142 ; AVX512-FCP-NEXT: retq
144 ; AVX512DQ-LABEL: load_i8_stride5_vf2:
; AVX512DQ: # %bb.0:
146 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
147 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
148 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
149 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
150 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
151 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
152 ; AVX512DQ-NEXT: vpextrw $0, %xmm1, (%rsi)
153 ; AVX512DQ-NEXT: vpextrw $0, %xmm2, (%rdx)
154 ; AVX512DQ-NEXT: vpextrw $0, %xmm3, (%rcx)
155 ; AVX512DQ-NEXT: vpextrw $0, %xmm4, (%r8)
156 ; AVX512DQ-NEXT: vpextrw $0, %xmm0, (%r9)
157 ; AVX512DQ-NEXT: retq
159 ; AVX512DQ-FCP-LABEL: load_i8_stride5_vf2:
160 ; AVX512DQ-FCP: # %bb.0:
161 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
162 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
163 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
164 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
165 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
166 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
167 ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm1, (%rsi)
168 ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm2, (%rdx)
169 ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm3, (%rcx)
170 ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm4, (%r8)
171 ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm0, (%r9)
172 ; AVX512DQ-FCP-NEXT: retq
174 ; AVX512BW-LABEL: load_i8_stride5_vf2:
; AVX512BW: # %bb.0:
176 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
177 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
178 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
179 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
180 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
181 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
182 ; AVX512BW-NEXT: vpextrw $0, %xmm1, (%rsi)
183 ; AVX512BW-NEXT: vpextrw $0, %xmm2, (%rdx)
184 ; AVX512BW-NEXT: vpextrw $0, %xmm3, (%rcx)
185 ; AVX512BW-NEXT: vpextrw $0, %xmm4, (%r8)
186 ; AVX512BW-NEXT: vpextrw $0, %xmm0, (%r9)
187 ; AVX512BW-NEXT: retq
189 ; AVX512BW-FCP-LABEL: load_i8_stride5_vf2:
190 ; AVX512BW-FCP: # %bb.0:
191 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
192 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
193 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
194 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
195 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
196 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
197 ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm1, (%rsi)
198 ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm2, (%rdx)
199 ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm3, (%rcx)
200 ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm4, (%r8)
201 ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm0, (%r9)
202 ; AVX512BW-FCP-NEXT: retq
204 ; AVX512DQ-BW-LABEL: load_i8_stride5_vf2:
205 ; AVX512DQ-BW: # %bb.0:
206 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
207 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
208 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
209 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
210 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
211 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
212 ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm1, (%rsi)
213 ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm2, (%rdx)
214 ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm3, (%rcx)
215 ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm4, (%r8)
216 ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm0, (%r9)
217 ; AVX512DQ-BW-NEXT: retq
219 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride5_vf2:
220 ; AVX512DQ-BW-FCP: # %bb.0:
221 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
222 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
223 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
224 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
225 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
226 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
227 ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm1, (%rsi)
228 ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm2, (%rdx)
229 ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm3, (%rcx)
230 ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm4, (%r8)
231 ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm0, (%r9)
232 ; AVX512DQ-BW-FCP-NEXT: retq
233 %wide.vec = load <10 x i8>, ptr %in.vec, align 64
234 %strided.vec0 = shufflevector <10 x i8> %wide.vec, <10 x i8> poison, <2 x i32> <i32 0, i32 5>
235 %strided.vec1 = shufflevector <10 x i8> %wide.vec, <10 x i8> poison, <2 x i32> <i32 1, i32 6>
236 %strided.vec2 = shufflevector <10 x i8> %wide.vec, <10 x i8> poison, <2 x i32> <i32 2, i32 7>
237 %strided.vec3 = shufflevector <10 x i8> %wide.vec, <10 x i8> poison, <2 x i32> <i32 3, i32 8>
238 %strided.vec4 = shufflevector <10 x i8> %wide.vec, <10 x i8> poison, <2 x i32> <i32 4, i32 9>
239 store <2 x i8> %strided.vec0, ptr %out.vec0, align 64
240 store <2 x i8> %strided.vec1, ptr %out.vec1, align 64
241 store <2 x i8> %strided.vec2, ptr %out.vec2, align 64
242 store <2 x i8> %strided.vec3, ptr %out.vec3, align 64
243 store <2 x i8> %strided.vec4, ptr %out.vec4, align 64
  ret void
}
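
; vf4: a <20 x i8> wide load split into five <4 x i8> stride-5 results.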
247 define void @load_i8_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
248 ; SSE-LABEL: load_i8_stride5_vf4:
; SSE: # %bb.0:
250 ; SSE-NEXT: movdqa (%rdi), %xmm5
251 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
252 ; SSE-NEXT: pxor %xmm4, %xmm4
253 ; SSE-NEXT: movdqa %xmm5, %xmm2
254 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
255 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
256 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[2,1,2,3,4,5,6,7]
257 ; SSE-NEXT: movdqa %xmm5, %xmm3
258 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
259 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
260 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
261 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
262 ; SSE-NEXT: packuswb %xmm1, %xmm1
263 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
264 ; SSE-NEXT: movdqa %xmm5, %xmm7
265 ; SSE-NEXT: pand %xmm6, %xmm7
266 ; SSE-NEXT: pandn %xmm0, %xmm6
267 ; SSE-NEXT: por %xmm7, %xmm6
268 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
269 ; SSE-NEXT: movdqa %xmm2, %xmm7
270 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm6[0,0]
271 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,3]
272 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1,1,3]
273 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,6,7]
274 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
275 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,3,0,4,5,6,7]
276 ; SSE-NEXT: packuswb %xmm6, %xmm6
277 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,65535,65535,65535,65535]
278 ; SSE-NEXT: movdqa %xmm5, %xmm8
279 ; SSE-NEXT: pand %xmm7, %xmm8
280 ; SSE-NEXT: pandn %xmm0, %xmm7
281 ; SSE-NEXT: por %xmm8, %xmm7
282 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
283 ; SSE-NEXT: movdqa %xmm2, %xmm8
284 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[3,0]
285 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[0,2]
286 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7]
287 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
288 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
289 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,1,4,5,6,7]
290 ; SSE-NEXT: packuswb %xmm7, %xmm7
291 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
292 ; SSE-NEXT: pand %xmm8, %xmm5
293 ; SSE-NEXT: pandn %xmm0, %xmm8
294 ; SSE-NEXT: por %xmm5, %xmm8
295 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
296 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[2,0]
297 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,6,5,6,7]
298 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
299 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,0,1,2,4,5,6,7]
300 ; SSE-NEXT: packuswb %xmm4, %xmm4
301 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[3,0]
302 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2]
303 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
304 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
305 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7]
306 ; SSE-NEXT: packuswb %xmm2, %xmm2
307 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
308 ; SSE-NEXT: pand %xmm3, %xmm2
309 ; SSE-NEXT: pandn %xmm0, %xmm3
310 ; SSE-NEXT: por %xmm2, %xmm3
311 ; SSE-NEXT: movd %xmm1, (%rsi)
312 ; SSE-NEXT: movd %xmm6, (%rdx)
313 ; SSE-NEXT: movd %xmm7, (%rcx)
314 ; SSE-NEXT: movd %xmm4, (%r8)
315 ; SSE-NEXT: movd %xmm3, (%r9)
; SSE-NEXT: retq
318 ; AVX-LABEL: load_i8_stride5_vf4:
; AVX: # %bb.0:
320 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
321 ; AVX-NEXT: vmovdqa (%rdi), %xmm1
322 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm2
323 ; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm3
324 ; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
325 ; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm4
326 ; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
327 ; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm5
328 ; AVX-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
329 ; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm6
330 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
331 ; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
332 ; AVX-NEXT: vmovd %xmm3, (%rsi)
333 ; AVX-NEXT: vmovd %xmm4, (%rdx)
334 ; AVX-NEXT: vmovd %xmm5, (%rcx)
335 ; AVX-NEXT: vmovd %xmm6, (%r8)
336 ; AVX-NEXT: vmovd %xmm0, (%r9)
; AVX-NEXT: retq
339 ; AVX2-LABEL: load_i8_stride5_vf4:
; AVX2: # %bb.0:
341 ; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
342 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
343 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
344 ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm3
345 ; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
346 ; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm4
347 ; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
348 ; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm5
349 ; AVX2-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
350 ; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm6
351 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
352 ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
353 ; AVX2-NEXT: vmovd %xmm3, (%rsi)
354 ; AVX2-NEXT: vmovd %xmm4, (%rdx)
355 ; AVX2-NEXT: vmovd %xmm5, (%rcx)
356 ; AVX2-NEXT: vmovd %xmm6, (%r8)
357 ; AVX2-NEXT: vmovd %xmm0, (%r9)
; AVX2-NEXT: retq
360 ; AVX2-FP-LABEL: load_i8_stride5_vf4:
; AVX2-FP: # %bb.0:
362 ; AVX2-FP-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
363 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1
364 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm2
365 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm3
366 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
367 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm4
368 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
369 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm5
370 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
371 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm6, %xmm6
372 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
373 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
374 ; AVX2-FP-NEXT: vmovd %xmm3, (%rsi)
375 ; AVX2-FP-NEXT: vmovd %xmm4, (%rdx)
376 ; AVX2-FP-NEXT: vmovd %xmm5, (%rcx)
377 ; AVX2-FP-NEXT: vmovd %xmm6, (%r8)
378 ; AVX2-FP-NEXT: vmovd %xmm0, (%r9)
; AVX2-FP-NEXT: retq
381 ; AVX2-FCP-LABEL: load_i8_stride5_vf4:
; AVX2-FCP: # %bb.0:
383 ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
384 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1
385 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
386 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm3
387 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
388 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4
389 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
390 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm5
391 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
392 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm6
393 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
394 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
395 ; AVX2-FCP-NEXT: vmovd %xmm3, (%rsi)
396 ; AVX2-FCP-NEXT: vmovd %xmm4, (%rdx)
397 ; AVX2-FCP-NEXT: vmovd %xmm5, (%rcx)
398 ; AVX2-FCP-NEXT: vmovd %xmm6, (%r8)
399 ; AVX2-FCP-NEXT: vmovd %xmm0, (%r9)
400 ; AVX2-FCP-NEXT: retq
402 ; AVX512-LABEL: load_i8_stride5_vf4:
; AVX512: # %bb.0:
404 ; AVX512-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
405 ; AVX512-NEXT: vmovdqa (%rdi), %xmm1
406 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2
407 ; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm3
408 ; AVX512-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
409 ; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm4
410 ; AVX512-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
411 ; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm5
412 ; AVX512-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
413 ; AVX512-NEXT: vpshufb %xmm0, %xmm6, %xmm6
414 ; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
415 ; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0
416 ; AVX512-NEXT: vmovd %xmm3, (%rsi)
417 ; AVX512-NEXT: vmovd %xmm4, (%rdx)
418 ; AVX512-NEXT: vmovd %xmm5, (%rcx)
419 ; AVX512-NEXT: vmovd %xmm6, (%r8)
420 ; AVX512-NEXT: vmovd %xmm0, (%r9)
; AVX512-NEXT: retq
423 ; AVX512-FCP-LABEL: load_i8_stride5_vf4:
424 ; AVX512-FCP: # %bb.0:
425 ; AVX512-FCP-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
426 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1
427 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
428 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm3
429 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
430 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4
431 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
432 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm5
433 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
434 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm6
435 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
436 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
437 ; AVX512-FCP-NEXT: vmovd %xmm3, (%rsi)
438 ; AVX512-FCP-NEXT: vmovd %xmm4, (%rdx)
439 ; AVX512-FCP-NEXT: vmovd %xmm5, (%rcx)
440 ; AVX512-FCP-NEXT: vmovd %xmm6, (%r8)
441 ; AVX512-FCP-NEXT: vmovd %xmm0, (%r9)
442 ; AVX512-FCP-NEXT: retq
444 ; AVX512DQ-LABEL: load_i8_stride5_vf4:
; AVX512DQ: # %bb.0:
446 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
447 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
448 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm2
449 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm3
450 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
451 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm4
452 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
453 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm5
454 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
455 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm6, %xmm6
456 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
457 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm0
458 ; AVX512DQ-NEXT: vmovd %xmm3, (%rsi)
459 ; AVX512DQ-NEXT: vmovd %xmm4, (%rdx)
460 ; AVX512DQ-NEXT: vmovd %xmm5, (%rcx)
461 ; AVX512DQ-NEXT: vmovd %xmm6, (%r8)
462 ; AVX512DQ-NEXT: vmovd %xmm0, (%r9)
463 ; AVX512DQ-NEXT: retq
465 ; AVX512DQ-FCP-LABEL: load_i8_stride5_vf4:
466 ; AVX512DQ-FCP: # %bb.0:
467 ; AVX512DQ-FCP-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
468 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1
469 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
470 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm3
471 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
472 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4
473 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
474 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm5
475 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
476 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm6
477 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
478 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
479 ; AVX512DQ-FCP-NEXT: vmovd %xmm3, (%rsi)
480 ; AVX512DQ-FCP-NEXT: vmovd %xmm4, (%rdx)
481 ; AVX512DQ-FCP-NEXT: vmovd %xmm5, (%rcx)
482 ; AVX512DQ-FCP-NEXT: vmovd %xmm6, (%r8)
483 ; AVX512DQ-FCP-NEXT: vmovd %xmm0, (%r9)
484 ; AVX512DQ-FCP-NEXT: retq
486 ; AVX512BW-LABEL: load_i8_stride5_vf4:
; AVX512BW: # %bb.0:
488 ; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
489 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1
490 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2
491 ; AVX512BW-NEXT: vpshufb %xmm0, %xmm1, %xmm3
492 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
493 ; AVX512BW-NEXT: vpshufb %xmm0, %xmm4, %xmm4
494 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
495 ; AVX512BW-NEXT: vpshufb %xmm0, %xmm5, %xmm5
496 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
497 ; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm6
498 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
499 ; AVX512BW-NEXT: vpshufb %xmm0, %xmm1, %xmm0
500 ; AVX512BW-NEXT: vmovd %xmm3, (%rsi)
501 ; AVX512BW-NEXT: vmovd %xmm4, (%rdx)
502 ; AVX512BW-NEXT: vmovd %xmm5, (%rcx)
503 ; AVX512BW-NEXT: vmovd %xmm6, (%r8)
504 ; AVX512BW-NEXT: vmovd %xmm0, (%r9)
505 ; AVX512BW-NEXT: retq
507 ; AVX512BW-FCP-LABEL: load_i8_stride5_vf4:
508 ; AVX512BW-FCP: # %bb.0:
509 ; AVX512BW-FCP-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
510 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1
511 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
512 ; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm3
513 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
514 ; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4
515 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
516 ; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm5
517 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
518 ; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm6
519 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
520 ; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
521 ; AVX512BW-FCP-NEXT: vmovd %xmm3, (%rsi)
522 ; AVX512BW-FCP-NEXT: vmovd %xmm4, (%rdx)
523 ; AVX512BW-FCP-NEXT: vmovd %xmm5, (%rcx)
524 ; AVX512BW-FCP-NEXT: vmovd %xmm6, (%r8)
525 ; AVX512BW-FCP-NEXT: vmovd %xmm0, (%r9)
526 ; AVX512BW-FCP-NEXT: retq
528 ; AVX512DQ-BW-LABEL: load_i8_stride5_vf4:
529 ; AVX512DQ-BW: # %bb.0:
530 ; AVX512DQ-BW-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
531 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm1
532 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm2
533 ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm1, %xmm3
534 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
535 ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm4, %xmm4
536 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
537 ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm5, %xmm5
538 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
539 ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm6
540 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
541 ; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm1, %xmm0
542 ; AVX512DQ-BW-NEXT: vmovd %xmm3, (%rsi)
543 ; AVX512DQ-BW-NEXT: vmovd %xmm4, (%rdx)
544 ; AVX512DQ-BW-NEXT: vmovd %xmm5, (%rcx)
545 ; AVX512DQ-BW-NEXT: vmovd %xmm6, (%r8)
546 ; AVX512DQ-BW-NEXT: vmovd %xmm0, (%r9)
547 ; AVX512DQ-BW-NEXT: retq
549 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride5_vf4:
550 ; AVX512DQ-BW-FCP: # %bb.0:
551 ; AVX512DQ-BW-FCP-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
552 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1
553 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
554 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm3
555 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
556 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4
557 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
558 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm5
559 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
560 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm6
561 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
562 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
563 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm3, (%rsi)
564 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm4, (%rdx)
565 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm5, (%rcx)
566 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm6, (%r8)
567 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%r9)
568 ; AVX512DQ-BW-FCP-NEXT: retq
569 %wide.vec = load <20 x i8>, ptr %in.vec, align 64
570 %strided.vec0 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
571 %strided.vec1 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 1, i32 6, i32 11, i32 16>
572 %strided.vec2 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 2, i32 7, i32 12, i32 17>
573 %strided.vec3 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 3, i32 8, i32 13, i32 18>
574 %strided.vec4 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 4, i32 9, i32 14, i32 19>
575 store <4 x i8> %strided.vec0, ptr %out.vec0, align 64
576 store <4 x i8> %strided.vec1, ptr %out.vec1, align 64
577 store <4 x i8> %strided.vec2, ptr %out.vec2, align 64
578 store <4 x i8> %strided.vec3, ptr %out.vec3, align 64
579 store <4 x i8> %strided.vec4, ptr %out.vec4, align 64
  ret void
}
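
; vf8: the same pattern with a <40 x i8> wide load and five <8 x i8> results.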
583 define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
584 ; SSE-LABEL: load_i8_stride5_vf8:
; SSE: # %bb.0:
586 ; SSE-NEXT: movdqa (%rdi), %xmm4
587 ; SSE-NEXT: movdqa 16(%rdi), %xmm3
588 ; SSE-NEXT: movdqa 32(%rdi), %xmm0
589 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
590 ; SSE-NEXT: movdqa %xmm1, %xmm2
591 ; SSE-NEXT: pandn %xmm3, %xmm2
592 ; SSE-NEXT: movdqa %xmm4, %xmm5
593 ; SSE-NEXT: pand %xmm1, %xmm5
594 ; SSE-NEXT: por %xmm2, %xmm5
595 ; SSE-NEXT: pxor %xmm6, %xmm6
596 ; SSE-NEXT: movdqa %xmm5, %xmm2
597 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
598 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,0,0,65535,65535]
599 ; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
600 ; SSE-NEXT: pand %xmm7, %xmm5
601 ; SSE-NEXT: pandn %xmm2, %xmm7
602 ; SSE-NEXT: por %xmm5, %xmm7
603 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,1,3,4,5,6,7]
604 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
605 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
606 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7]
607 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,6,5,7]
608 ; SSE-NEXT: packuswb %xmm7, %xmm7
609 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
610 ; SSE-NEXT: pand %xmm2, %xmm7
611 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,1,1]
612 ; SSE-NEXT: movdqa %xmm2, %xmm5
613 ; SSE-NEXT: pandn %xmm8, %xmm5
614 ; SSE-NEXT: por %xmm7, %xmm5
615 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
616 ; SSE-NEXT: movdqa %xmm3, %xmm8
617 ; SSE-NEXT: pand %xmm7, %xmm8
618 ; SSE-NEXT: pandn %xmm4, %xmm7
619 ; SSE-NEXT: por %xmm8, %xmm7
620 ; SSE-NEXT: movdqa %xmm7, %xmm8
621 ; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
622 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,0,65535,65535,65535,0]
623 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
624 ; SSE-NEXT: pand %xmm9, %xmm7
625 ; SSE-NEXT: pandn %xmm8, %xmm9
626 ; SSE-NEXT: por %xmm7, %xmm9
627 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,2,1,3]
628 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,5]
629 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,1]
630 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7]
631 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,4,5,7]
632 ; SSE-NEXT: packuswb %xmm7, %xmm7
633 ; SSE-NEXT: pand %xmm2, %xmm7
634 ; SSE-NEXT: movdqa %xmm0, %xmm8
635 ; SSE-NEXT: pslld $24, %xmm8
636 ; SSE-NEXT: pandn %xmm8, %xmm2
637 ; SSE-NEXT: por %xmm7, %xmm2
638 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
639 ; SSE-NEXT: movdqa %xmm3, %xmm8
640 ; SSE-NEXT: pand %xmm7, %xmm8
641 ; SSE-NEXT: pandn %xmm4, %xmm7
642 ; SSE-NEXT: por %xmm8, %xmm7
643 ; SSE-NEXT: movdqa %xmm7, %xmm9
644 ; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
645 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535]
646 ; SSE-NEXT: movdqa %xmm8, %xmm10
647 ; SSE-NEXT: pandn %xmm9, %xmm10
648 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
649 ; SSE-NEXT: pand %xmm8, %xmm7
650 ; SSE-NEXT: por %xmm10, %xmm7
651 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7]
652 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
653 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3]
654 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,1,4,5,6,7]
655 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,5,6,7]
656 ; SSE-NEXT: packuswb %xmm10, %xmm10
657 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535]
658 ; SSE-NEXT: pand %xmm7, %xmm10
659 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
660 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,2,0]
661 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5,6,5]
662 ; SSE-NEXT: packuswb %xmm11, %xmm11
663 ; SSE-NEXT: movdqa %xmm7, %xmm9
664 ; SSE-NEXT: pandn %xmm11, %xmm9
665 ; SSE-NEXT: por %xmm10, %xmm9
666 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
667 ; SSE-NEXT: movdqa %xmm3, %xmm11
668 ; SSE-NEXT: pand %xmm10, %xmm11
669 ; SSE-NEXT: pandn %xmm4, %xmm10
670 ; SSE-NEXT: por %xmm11, %xmm10
671 ; SSE-NEXT: movdqa %xmm10, %xmm11
672 ; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15]
673 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
674 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm11[2,0]
675 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6,5]
676 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[3,1,2,0]
677 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,0,1,2,4,5,6,7]
678 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,7,4,6,7]
679 ; SSE-NEXT: packuswb %xmm11, %xmm11
680 ; SSE-NEXT: pand %xmm7, %xmm11
681 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,1,0,3]
682 ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,5,6]
683 ; SSE-NEXT: packuswb %xmm12, %xmm12
684 ; SSE-NEXT: movdqa %xmm7, %xmm10
685 ; SSE-NEXT: pandn %xmm12, %xmm10
686 ; SSE-NEXT: por %xmm11, %xmm10
687 ; SSE-NEXT: pand %xmm1, %xmm3
688 ; SSE-NEXT: pandn %xmm4, %xmm1
689 ; SSE-NEXT: por %xmm3, %xmm1
690 ; SSE-NEXT: movdqa %xmm1, %xmm3
691 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
692 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
693 ; SSE-NEXT: pand %xmm8, %xmm1
694 ; SSE-NEXT: pandn %xmm3, %xmm8
695 ; SSE-NEXT: por %xmm1, %xmm8
696 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,1,0,3,4,5,6,7]
697 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4]
698 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
699 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
700 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
701 ; SSE-NEXT: packuswb %xmm1, %xmm1
702 ; SSE-NEXT: pand %xmm7, %xmm1
703 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
704 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
705 ; SSE-NEXT: packuswb %xmm0, %xmm0
706 ; SSE-NEXT: pandn %xmm0, %xmm7
707 ; SSE-NEXT: por %xmm1, %xmm7
708 ; SSE-NEXT: movq %xmm5, (%rsi)
709 ; SSE-NEXT: movq %xmm2, (%rdx)
710 ; SSE-NEXT: movq %xmm9, (%rcx)
711 ; SSE-NEXT: movq %xmm10, (%r8)
712 ; SSE-NEXT: movq %xmm7, (%r9)
; SSE-NEXT: retq
715 ; AVX-LABEL: load_i8_stride5_vf8:
; AVX: # %bb.0:
717 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
718 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
719 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
720 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
721 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
722 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
723 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
724 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,12,7,u,u,u,u,u,u,u,u]
725 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
726 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[0,5,10,15,u,u,u,u,u,u,u,u,u]
727 ; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4
728 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
729 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,9,u,u,u,u,u,u,u,u]
730 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11,u,u,u,u,u,u,u,u,u,u]
731 ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u]
732 ; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5
733 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
734 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,2,4,6,8,10,1,11,u,u,u,u,u,u,u,u]
735 ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u,u,u,u]
736 ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u]
737 ; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6
738 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
739 ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,2,4,6,8,10,3,13,u,u,u,u,u,u,u,u]
740 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13,u,u,u,u,u,u,u,u,u,u]
741 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u]
742 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
743 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
744 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,5,15,u,u,u,u,u,u,u,u]
745 ; AVX-NEXT: vmovq %xmm3, (%rsi)
746 ; AVX-NEXT: vmovq %xmm4, (%rdx)
747 ; AVX-NEXT: vmovq %xmm5, (%rcx)
748 ; AVX-NEXT: vmovq %xmm6, (%r8)
749 ; AVX-NEXT: vmovq %xmm0, (%r9)
; AVX-NEXT: retq
752 ; AVX2-LABEL: load_i8_stride5_vf8:
; AVX2: # %bb.0:
754 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
755 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
756 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
757 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
758 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
759 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
760 ; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3
761 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
762 ; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
763 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
764 ; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4
765 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
766 ; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
767 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
768 ; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm5
769 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
770 ; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
771 ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
772 ; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6
773 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
774 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
775 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
776 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
777 ; AVX2-NEXT: vmovq %xmm3, (%rsi)
778 ; AVX2-NEXT: vmovq %xmm4, (%rdx)
779 ; AVX2-NEXT: vmovq %xmm5, (%rcx)
780 ; AVX2-NEXT: vmovq %xmm6, (%r8)
781 ; AVX2-NEXT: vmovq %xmm0, (%r9)
; AVX2-NEXT: retq
784 ; AVX2-FP-LABEL: load_i8_stride5_vf8:
; AVX2-FP: # %bb.0:
786 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
787 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
788 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2
789 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
790 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
791 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
792 ; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3
793 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
794 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
795 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
796 ; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4
797 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
798 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
799 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
800 ; AVX2-FP-NEXT: vpor %xmm5, %xmm6, %xmm5
801 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
802 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
803 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
804 ; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6
805 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
806 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
807 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
808 ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0
809 ; AVX2-FP-NEXT: vmovq %xmm3, (%rsi)
810 ; AVX2-FP-NEXT: vmovq %xmm4, (%rdx)
811 ; AVX2-FP-NEXT: vmovq %xmm5, (%rcx)
812 ; AVX2-FP-NEXT: vmovq %xmm6, (%r8)
813 ; AVX2-FP-NEXT: vmovq %xmm0, (%r9)
; AVX2-FP-NEXT: retq
816 ; AVX2-FCP-LABEL: load_i8_stride5_vf8:
; AVX2-FCP: # %bb.0:
818 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
819 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
820 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
821 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
822 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
823 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
824 ; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
825 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
826 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
827 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
828 ; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
829 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
830 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
831 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
832 ; AVX2-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5
833 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
834 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
835 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
836 ; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
837 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
838 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
839 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
840 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
841 ; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi)
842 ; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx)
843 ; AVX2-FCP-NEXT: vmovq %xmm5, (%rcx)
844 ; AVX2-FCP-NEXT: vmovq %xmm6, (%r8)
845 ; AVX2-FCP-NEXT: vmovq %xmm0, (%r9)
846 ; AVX2-FCP-NEXT: retq
848 ; AVX512-LABEL: load_i8_stride5_vf8:
; AVX512: # %bb.0:
850 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
851 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
852 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
853 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
854 ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
855 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
856 ; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3
857 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
858 ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
859 ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
860 ; AVX512-NEXT: vpor %xmm4, %xmm5, %xmm4
861 ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
862 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
863 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
864 ; AVX512-NEXT: vpor %xmm5, %xmm6, %xmm5
865 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
866 ; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
867 ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
868 ; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6
869 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
870 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
871 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
872 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
873 ; AVX512-NEXT: vmovq %xmm3, (%rsi)
874 ; AVX512-NEXT: vmovq %xmm4, (%rdx)
875 ; AVX512-NEXT: vmovq %xmm5, (%rcx)
876 ; AVX512-NEXT: vmovq %xmm6, (%r8)
877 ; AVX512-NEXT: vmovq %xmm0, (%r9)
; AVX512-NEXT: retq
880 ; AVX512-FCP-LABEL: load_i8_stride5_vf8:
881 ; AVX512-FCP: # %bb.0:
882 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
883 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
884 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
885 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
886 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
887 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
888 ; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
889 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
890 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
891 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
892 ; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
893 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
894 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
895 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
896 ; AVX512-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5
897 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
898 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
899 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
900 ; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
901 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
902 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
903 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
904 ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
905 ; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi)
906 ; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx)
907 ; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx)
908 ; AVX512-FCP-NEXT: vmovq %xmm6, (%r8)
909 ; AVX512-FCP-NEXT: vmovq %xmm0, (%r9)
910 ; AVX512-FCP-NEXT: retq
912 ; AVX512DQ-LABEL: load_i8_stride5_vf8:
; AVX512DQ: # %bb.0:
914 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
915 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
916 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2
917 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
918 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
919 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
920 ; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3
921 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
922 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
923 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
924 ; AVX512DQ-NEXT: vpor %xmm4, %xmm5, %xmm4
925 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
926 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
927 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
928 ; AVX512DQ-NEXT: vpor %xmm5, %xmm6, %xmm5
929 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
930 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
931 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
932 ; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6
933 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
934 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
935 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
936 ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0
937 ; AVX512DQ-NEXT: vmovq %xmm3, (%rsi)
938 ; AVX512DQ-NEXT: vmovq %xmm4, (%rdx)
939 ; AVX512DQ-NEXT: vmovq %xmm5, (%rcx)
940 ; AVX512DQ-NEXT: vmovq %xmm6, (%r8)
941 ; AVX512DQ-NEXT: vmovq %xmm0, (%r9)
942 ; AVX512DQ-NEXT: retq
944 ; AVX512DQ-FCP-LABEL: load_i8_stride5_vf8:
945 ; AVX512DQ-FCP: # %bb.0:
946 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
947 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
948 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
949 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
950 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
951 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
952 ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
953 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
954 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
955 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
956 ; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
957 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
958 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
959 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
960 ; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5
961 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
962 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
963 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
964 ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
965 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
966 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
967 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
968 ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
969 ; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi)
970 ; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx)
971 ; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx)
972 ; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8)
973 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9)
974 ; AVX512DQ-FCP-NEXT: retq
976 ; AVX512BW-LABEL: load_i8_stride5_vf8:
; AVX512BW: # %bb.0:
978 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
979 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
980 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
981 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
982 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
983 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
984 ; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3
985 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
986 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
987 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
988 ; AVX512BW-NEXT: vpor %xmm4, %xmm5, %xmm4
989 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
990 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
991 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
992 ; AVX512BW-NEXT: vpor %xmm5, %xmm6, %xmm5
993 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
994 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
995 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
996 ; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6
997 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
998 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
999 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
1000 ; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1001 ; AVX512BW-NEXT: vmovq %xmm3, (%rsi)
1002 ; AVX512BW-NEXT: vmovq %xmm4, (%rdx)
1003 ; AVX512BW-NEXT: vmovq %xmm5, (%rcx)
1004 ; AVX512BW-NEXT: vmovq %xmm6, (%r8)
1005 ; AVX512BW-NEXT: vmovq %xmm0, (%r9)
1006 ; AVX512BW-NEXT: retq
1008 ; AVX512BW-FCP-LABEL: load_i8_stride5_vf8:
1009 ; AVX512BW-FCP: # %bb.0:
1010 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
1011 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
1012 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
1013 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
1014 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1015 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
1016 ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
1017 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
1018 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
1019 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
1020 ; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
1021 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
1022 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
1023 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
1024 ; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5
1025 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
1026 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
1027 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
1028 ; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
1029 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
1030 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
1031 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
1032 ; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
1033 ; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi)
1034 ; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rdx)
1035 ; AVX512BW-FCP-NEXT: vmovq %xmm5, (%rcx)
1036 ; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r8)
1037 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9)
1038 ; AVX512BW-FCP-NEXT: retq
1040 ; AVX512DQ-BW-LABEL: load_i8_stride5_vf8:
1041 ; AVX512DQ-BW: # %bb.0:
1042 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
1043 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1
1044 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2
1045 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
1046 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1047 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
1048 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3
1049 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
1050 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
1051 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
1052 ; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm5, %xmm4
1053 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
1054 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
1055 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
1056 ; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm6, %xmm5
1057 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
1058 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
1059 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
1060 ; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm7, %xmm6
1061 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
1062 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
1063 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
1064 ; AVX512DQ-BW-NEXT: vpor %xmm1, %xmm0, %xmm0
1065 ; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi)
1066 ; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rdx)
1067 ; AVX512DQ-BW-NEXT: vmovq %xmm5, (%rcx)
1068 ; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r8)
1069 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9)
1070 ; AVX512DQ-BW-NEXT: retq
1072 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride5_vf8:
1073 ; AVX512DQ-BW-FCP: # %bb.0:
1074 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
1075 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
1076 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
1077 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
1078 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1079 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
1080 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
1081 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
1082 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
1083 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
1084 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
1085 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
1086 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
1087 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
1088 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5
1089 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
1090 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
1091 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
1092 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
1093 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
1094 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
1095 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
1096 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
1097 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi)
1098 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rdx)
1099 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%rcx)
1100 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r8)
1101 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9)
1102 ; AVX512DQ-BW-FCP-NEXT: retq
1103 %wide.vec = load <40 x i8>, ptr %in.vec, align 64
1104 %strided.vec0 = shufflevector <40 x i8> %wide.vec, <40 x i8> poison, <8 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35>
1105 %strided.vec1 = shufflevector <40 x i8> %wide.vec, <40 x i8> poison, <8 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36>
1106 %strided.vec2 = shufflevector <40 x i8> %wide.vec, <40 x i8> poison, <8 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37>
1107 %strided.vec3 = shufflevector <40 x i8> %wide.vec, <40 x i8> poison, <8 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38>
1108 %strided.vec4 = shufflevector <40 x i8> %wide.vec, <40 x i8> poison, <8 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39>
1109 store <8 x i8> %strided.vec0, ptr %out.vec0, align 64
1110 store <8 x i8> %strided.vec1, ptr %out.vec1, align 64
1111 store <8 x i8> %strided.vec2, ptr %out.vec2, align 64
1112 store <8 x i8> %strided.vec3, ptr %out.vec3, align 64
1113 store <8 x i8> %strided.vec4, ptr %out.vec4, align 64
1114 ret void
1115 }
1117 define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
1118 ; SSE-LABEL: load_i8_stride5_vf16:
1119 ; SSE: # %bb.0:
1120 ; SSE-NEXT: movdqa 64(%rdi), %xmm9
1121 ; SSE-NEXT: movdqa (%rdi), %xmm1
1122 ; SSE-NEXT: movdqa 16(%rdi), %xmm6
1123 ; SSE-NEXT: movdqa 32(%rdi), %xmm10
1124 ; SSE-NEXT: movdqa 48(%rdi), %xmm2
1125 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
1126 ; SSE-NEXT: movdqa %xmm3, %xmm0
1127 ; SSE-NEXT: pandn %xmm10, %xmm0
1128 ; SSE-NEXT: movdqa %xmm2, %xmm4
1129 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1130 ; SSE-NEXT: pand %xmm3, %xmm4
1131 ; SSE-NEXT: por %xmm0, %xmm4
1132 ; SSE-NEXT: pxor %xmm8, %xmm8
1133 ; SSE-NEXT: movdqa %xmm4, %xmm0
1134 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
1135 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,1,3]
1136 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
1137 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
1138 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0]
1139 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
1140 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
1141 ; SSE-NEXT: packuswb %xmm4, %xmm0
1142 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3]
1143 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255]
1144 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
1145 ; SSE-NEXT: movdqa %xmm4, %xmm5
1146 ; SSE-NEXT: pandn %xmm6, %xmm5
1147 ; SSE-NEXT: movdqa %xmm6, %xmm15
1148 ; SSE-NEXT: movdqa %xmm1, %xmm6
1149 ; SSE-NEXT: movdqa %xmm1, %xmm13
1150 ; SSE-NEXT: pand %xmm4, %xmm6
1151 ; SSE-NEXT: por %xmm5, %xmm6
1152 ; SSE-NEXT: movdqa %xmm6, %xmm5
1153 ; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
1154 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,0,0,65535,65535]
1155 ; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15]
1156 ; SSE-NEXT: pand %xmm7, %xmm6
1157 ; SSE-NEXT: pandn %xmm5, %xmm7
1158 ; SSE-NEXT: por %xmm6, %xmm7
1159 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,1,3,4,5,6,7]
1160 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,7]
1161 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
1162 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,1,3,4,5,6,7]
1163 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,5,7]
1164 ; SSE-NEXT: packuswb %xmm7, %xmm7
1165 ; SSE-NEXT: pand %xmm11, %xmm7
1166 ; SSE-NEXT: movdqa %xmm11, %xmm5
1167 ; SSE-NEXT: pandn %xmm0, %xmm5
1168 ; SSE-NEXT: por %xmm5, %xmm7
1169 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
1170 ; SSE-NEXT: pand %xmm6, %xmm7
1171 ; SSE-NEXT: movdqa %xmm9, %xmm1
1172 ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
1173 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
1174 ; SSE-NEXT: movdqa %xmm1, %xmm0
1175 ; SSE-NEXT: movdqa %xmm1, %xmm5
1176 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1177 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0]
1178 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,3]
1179 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
1180 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
1181 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
1182 ; SSE-NEXT: packuswb %xmm0, %xmm0
1183 ; SSE-NEXT: movdqa %xmm6, %xmm1
1184 ; SSE-NEXT: pandn %xmm0, %xmm1
1185 ; SSE-NEXT: por %xmm7, %xmm1
1186 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1187 ; SSE-NEXT: movdqa %xmm4, %xmm7
1188 ; SSE-NEXT: pandn %xmm10, %xmm7
1189 ; SSE-NEXT: movdqa %xmm2, %xmm0
1190 ; SSE-NEXT: pand %xmm4, %xmm0
1191 ; SSE-NEXT: por %xmm7, %xmm0
1192 ; SSE-NEXT: movdqa %xmm0, %xmm12
1193 ; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
1194 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
1195 ; SSE-NEXT: movdqa %xmm0, %xmm7
1196 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm12[0,0]
1197 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm12[2,3]
1198 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
1199 ; SSE-NEXT: movdqa %xmm14, %xmm12
1200 ; SSE-NEXT: movdqa %xmm13, %xmm1
1201 ; SSE-NEXT: pandn %xmm13, %xmm12
1202 ; SSE-NEXT: movdqa %xmm15, %xmm13
1203 ; SSE-NEXT: movdqa %xmm15, %xmm2
1204 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1205 ; SSE-NEXT: pand %xmm14, %xmm13
1206 ; SSE-NEXT: por %xmm12, %xmm13
1207 ; SSE-NEXT: movdqa %xmm13, %xmm12
1208 ; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
1209 ; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
1210 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,0,65535,65535,65535,0]
1211 ; SSE-NEXT: pand %xmm15, %xmm13
1212 ; SSE-NEXT: pandn %xmm12, %xmm15
1213 ; SSE-NEXT: por %xmm13, %xmm15
1214 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm15[0,2,1,3]
1215 ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5]
1216 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
1217 ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[1,2,3,0,4,5,6,7]
1218 ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,4,5,7]
1219 ; SSE-NEXT: packuswb %xmm12, %xmm12
1220 ; SSE-NEXT: pand %xmm11, %xmm12
1221 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3]
1222 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,6,7]
1223 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,1]
1224 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7]
1225 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7]
1226 ; SSE-NEXT: psllq $48, %xmm0
1227 ; SSE-NEXT: packuswb %xmm7, %xmm0
1228 ; SSE-NEXT: movdqa %xmm5, %xmm7
1229 ; SSE-NEXT: pandn %xmm0, %xmm11
1230 ; SSE-NEXT: por %xmm11, %xmm12
1231 ; SSE-NEXT: pand %xmm6, %xmm12
1232 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm9[3,0]
1233 ; SSE-NEXT: movaps %xmm9, %xmm0
1234 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0,2]
1235 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
1236 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
1237 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
1238 ; SSE-NEXT: packuswb %xmm0, %xmm0
1239 ; SSE-NEXT: movdqa %xmm6, %xmm5
1240 ; SSE-NEXT: pandn %xmm0, %xmm5
1241 ; SSE-NEXT: por %xmm12, %xmm5
1242 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1243 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
1244 ; SSE-NEXT: movdqa %xmm12, %xmm0
1245 ; SSE-NEXT: pandn %xmm1, %xmm0
1246 ; SSE-NEXT: movdqa %xmm1, %xmm5
1247 ; SSE-NEXT: pand %xmm12, %xmm2
1248 ; SSE-NEXT: por %xmm0, %xmm2
1249 ; SSE-NEXT: movdqa %xmm2, %xmm0
1250 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
1251 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,65535,65535,65535]
1252 ; SSE-NEXT: movdqa %xmm13, %xmm15
1253 ; SSE-NEXT: pandn %xmm0, %xmm15
1254 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
1255 ; SSE-NEXT: pand %xmm13, %xmm2
1256 ; SSE-NEXT: por %xmm15, %xmm2
1257 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,2,3,4,5,6,7]
1258 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
1259 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1260 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
1261 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
1262 ; SSE-NEXT: packuswb %xmm0, %xmm0
1263 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,0,0,65535,65535,65535,65535,65535]
1264 ; SSE-NEXT: pandn %xmm0, %xmm15
1265 ; SSE-NEXT: movdqa %xmm4, %xmm0
1266 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
1267 ; SSE-NEXT: pandn %xmm11, %xmm0
1268 ; SSE-NEXT: movdqa %xmm11, %xmm7
1269 ; SSE-NEXT: pand %xmm14, %xmm7
1270 ; SSE-NEXT: pandn %xmm10, %xmm14
1271 ; SSE-NEXT: pand %xmm12, %xmm11
1272 ; SSE-NEXT: pandn %xmm10, %xmm12
1273 ; SSE-NEXT: pand %xmm4, %xmm10
1274 ; SSE-NEXT: por %xmm0, %xmm10
1275 ; SSE-NEXT: movdqa %xmm10, %xmm0
1276 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
1277 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
1278 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0]
1279 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm0[3,0]
1280 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[0,2]
1281 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
1282 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
1283 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1284 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
1285 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
1286 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
1287 ; SSE-NEXT: packuswb %xmm0, %xmm1
1288 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1289 ; SSE-NEXT: por %xmm15, %xmm1
1290 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1]
1291 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1292 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,2,2,3]
1293 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
1294 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,3,2,3,4,5,6,7]
1295 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
1296 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
1297 ; SSE-NEXT: packuswb %xmm0, %xmm0
1298 ; SSE-NEXT: movdqa %xmm6, %xmm10
1299 ; SSE-NEXT: pandn %xmm0, %xmm10
1300 ; SSE-NEXT: pand %xmm6, %xmm1
1301 ; SSE-NEXT: por %xmm1, %xmm10
1302 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
1303 ; SSE-NEXT: movdqa %xmm15, %xmm0
1304 ; SSE-NEXT: pand %xmm3, %xmm0
1305 ; SSE-NEXT: pandn %xmm5, %xmm3
1306 ; SSE-NEXT: por %xmm0, %xmm3
1307 ; SSE-NEXT: movdqa %xmm3, %xmm0
1308 ; SSE-NEXT: pxor %xmm1, %xmm1
1309 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
1310 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
1311 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,0]
1312 ; SSE-NEXT: por %xmm7, %xmm14
1313 ; SSE-NEXT: movdqa %xmm14, %xmm0
1314 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1315 ; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm1[8],xmm14[9],xmm1[9],xmm14[10],xmm1[10],xmm14[11],xmm1[11],xmm14[12],xmm1[12],xmm14[13],xmm1[13],xmm14[14],xmm1[14],xmm14[15],xmm1[15]
1316 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm0[2,0]
1317 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,5,6,7]
1318 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
1319 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7]
1320 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
1321 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
1322 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
1323 ; SSE-NEXT: packuswb %xmm1, %xmm0
1324 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,65535,65535,65535,65535,65535]
1325 ; SSE-NEXT: pand %xmm1, %xmm0
1326 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,5,6,5]
1327 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,1,2,0]
1328 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,0,1,2,4,5,6,7]
1329 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,4,6,7]
1330 ; SSE-NEXT: packuswb %xmm7, %xmm7
1331 ; SSE-NEXT: pandn %xmm7, %xmm1
1332 ; SSE-NEXT: movaps %xmm9, %xmm7
1333 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm2[3,0]
1334 ; SSE-NEXT: por %xmm1, %xmm0
1335 ; SSE-NEXT: movaps %xmm2, %xmm1
1336 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0,2]
1337 ; SSE-NEXT: pand %xmm6, %xmm0
1338 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
1339 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
1340 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5]
1341 ; SSE-NEXT: packuswb %xmm1, %xmm1
1342 ; SSE-NEXT: pandn %xmm1, %xmm6
1343 ; SSE-NEXT: por %xmm0, %xmm6
1344 ; SSE-NEXT: por %xmm11, %xmm12
1345 ; SSE-NEXT: movdqa %xmm12, %xmm1
1346 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
1347 ; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
1348 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,1,3]
1349 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm1[1,2]
1350 ; SSE-NEXT: movdqa %xmm15, %xmm1
1351 ; SSE-NEXT: pand %xmm4, %xmm1
1352 ; SSE-NEXT: pandn %xmm5, %xmm4
1353 ; SSE-NEXT: por %xmm1, %xmm4
1354 ; SSE-NEXT: movdqa %xmm4, %xmm1
1355 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
1356 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
1357 ; SSE-NEXT: pand %xmm13, %xmm4
1358 ; SSE-NEXT: pandn %xmm1, %xmm13
1359 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2,3,1]
1360 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,1,2,3,4,5,6,7]
1361 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
1362 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1363 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,0,4,5,6,7]
1364 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
1365 ; SSE-NEXT: packuswb %xmm1, %xmm0
1366 ; SSE-NEXT: por %xmm4, %xmm13
1367 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,0,0,65535,65535]
1368 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[3,1,0,3,4,5,6,7]
1369 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,4]
1370 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
1371 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,0,4,5,6,7]
1372 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
1373 ; SSE-NEXT: packuswb %xmm4, %xmm4
1374 ; SSE-NEXT: pand %xmm3, %xmm4
1375 ; SSE-NEXT: pandn %xmm0, %xmm3
1376 ; SSE-NEXT: por %xmm3, %xmm4
1377 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
1378 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,2,3]
1379 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
1380 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
1381 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1382 ; SSE-NEXT: packuswb %xmm1, %xmm2
1383 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,1]
1384 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1385 ; SSE-NEXT: movaps %xmm0, (%rsi)
1386 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1387 ; SSE-NEXT: movaps %xmm0, (%rdx)
1388 ; SSE-NEXT: movdqa %xmm10, (%rcx)
1389 ; SSE-NEXT: movdqa %xmm6, (%r8)
1390 ; SSE-NEXT: movaps %xmm4, (%r9)
1391 ; SSE-NEXT: retq
1393 ; AVX-LABEL: load_i8_stride5_vf16:
1394 ; AVX: # %bb.0:
1395 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1396 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm2
1397 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
1398 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm3
1399 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
1400 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
1401 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
1402 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[2,7,12,u,u,u]
1403 ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm1[u,u,u]
1404 ; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5
1405 ; AVX-NEXT: vmovq {{.*#+}} xmm6 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
1406 ; AVX-NEXT: vpblendvb %xmm6, %xmm4, %xmm5, %xmm4
1407 ; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
1408 ; AVX-NEXT: vpshufb %xmm7, %xmm4, %xmm5
1409 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm4
1410 ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[1,6,11]
1411 ; AVX-NEXT: vpor %xmm5, %xmm8, %xmm5
1412 ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
1413 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm2[0,5,10,15,u,u,u,u,u,u,u,u,u]
1414 ; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8
1415 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[3,8,13,u,u,u]
1416 ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm1[u,u,u]
1417 ; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9
1418 ; AVX-NEXT: vpblendvb %xmm6, %xmm8, %xmm9, %xmm6
1419 ; AVX-NEXT: vpshufb %xmm7, %xmm6, %xmm6
1420 ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12]
1421 ; AVX-NEXT: vpor %xmm6, %xmm8, %xmm6
1422 ; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u]
1423 ; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm9
1424 ; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm8
1425 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3,4,5,6,7]
1426 ; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u]
1427 ; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm10
1428 ; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm9
1429 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3,4,5,6,7]
1430 ; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8
1431 ; AVX-NEXT: vpshufb %xmm7, %xmm8, %xmm8
1432 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13]
1433 ; AVX-NEXT: vpor %xmm9, %xmm8, %xmm8
1434 ; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,2,7,12,1,6,11,128,128,128,128,u,u,u]
1435 ; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm10
1436 ; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm9
1437 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3,4,5,6,7]
1438 ; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [3,8,13,128,128,128,128,128,128,0,5,10,15,u,u,u]
1439 ; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm11
1440 ; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm10
1441 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4,5,6,7]
1442 ; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9
1443 ; AVX-NEXT: vpshufb %xmm7, %xmm9, %xmm7
1444 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[4,9,14]
1445 ; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7
1446 ; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u]
1447 ; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm3
1448 ; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm2
1449 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5],xmm2[6,7]
1450 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u]
1451 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1452 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
1453 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5],xmm0[6,7]
1454 ; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
1455 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
1456 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
1457 ; AVX-NEXT: vmovdqa %xmm5, (%rsi)
1458 ; AVX-NEXT: vmovdqa %xmm6, (%rdx)
1459 ; AVX-NEXT: vmovdqa %xmm8, (%rcx)
1460 ; AVX-NEXT: vmovdqa %xmm7, (%r8)
1461 ; AVX-NEXT: vmovdqa %xmm0, (%r9)
1462 ; AVX-NEXT: retq
1464 ; AVX2-LABEL: load_i8_stride5_vf16:
1465 ; AVX2: # %bb.0:
1466 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1467 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
1468 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
1469 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
1470 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
1471 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u]
1472 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u]
1473 ; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
1474 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
1475 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm3
1476 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2
1477 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1,6,11]
1478 ; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3
1479 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
1480 ; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm5
1481 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
1482 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5
1483 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
1484 ; AVX2-NEXT: vpor %xmm6, %xmm5, %xmm5
1485 ; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5
1486 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,7,12]
1487 ; AVX2-NEXT: vpor %xmm6, %xmm5, %xmm5
1488 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
1489 ; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6
1490 ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
1491 ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
1492 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u]
1493 ; AVX2-NEXT: vpor %xmm7, %xmm6, %xmm6
1494 ; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm6
1495 ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3,8,13]
1496 ; AVX2-NEXT: vpor %xmm7, %xmm6, %xmm6
1497 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
1498 ; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm7
1499 ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u]
1500 ; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm7
1501 ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u]
1502 ; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7
1503 ; AVX2-NEXT: vpshufb %xmm4, %xmm7, %xmm4
1504 ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4,9,14]
1505 ; AVX2-NEXT: vpor %xmm7, %xmm4, %xmm4
1506 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
1507 ; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
1508 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1509 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u]
1510 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u]
1511 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
1512 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
1513 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1514 ; AVX2-NEXT: vmovdqa %xmm3, (%rsi)
1515 ; AVX2-NEXT: vmovdqa %xmm5, (%rdx)
1516 ; AVX2-NEXT: vmovdqa %xmm6, (%rcx)
1517 ; AVX2-NEXT: vmovdqa %xmm4, (%r8)
1518 ; AVX2-NEXT: vmovdqa %xmm0, (%r9)
1519 ; AVX2-NEXT: vzeroupper
1520 ; AVX2-NEXT: retq
1522 ; AVX2-FP-LABEL: load_i8_stride5_vf16:
1523 ; AVX2-FP: # %bb.0:
1524 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
1525 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1
1526 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
1527 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
1528 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
1529 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u]
1530 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u]
1531 ; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2
1532 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
1533 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm3
1534 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm2
1535 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1,6,11]
1536 ; AVX2-FP-NEXT: vpor %xmm5, %xmm3, %xmm3
1537 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
1538 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm5
1539 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
1540 ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5
1541 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
1542 ; AVX2-FP-NEXT: vpor %xmm6, %xmm5, %xmm5
1543 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm5
1544 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,7,12]
1545 ; AVX2-FP-NEXT: vpor %xmm6, %xmm5, %xmm5
1546 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
1547 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6
1548 ; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7
1549 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
1550 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u]
1551 ; AVX2-FP-NEXT: vpor %xmm7, %xmm6, %xmm6
1552 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm6, %xmm6
1553 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3,8,13]
1554 ; AVX2-FP-NEXT: vpor %xmm7, %xmm6, %xmm6
1555 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
1556 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm7
1557 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u]
1558 ; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm7
1559 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u]
1560 ; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7
1561 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm7, %xmm4
1562 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4,9,14]
1563 ; AVX2-FP-NEXT: vpor %xmm7, %xmm4, %xmm4
1564 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
1565 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
1566 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
1567 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u]
1568 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u]
1569 ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0
1570 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
1571 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1572 ; AVX2-FP-NEXT: vmovdqa %xmm3, (%rsi)
1573 ; AVX2-FP-NEXT: vmovdqa %xmm5, (%rdx)
1574 ; AVX2-FP-NEXT: vmovdqa %xmm6, (%rcx)
1575 ; AVX2-FP-NEXT: vmovdqa %xmm4, (%r8)
1576 ; AVX2-FP-NEXT: vmovdqa %xmm0, (%r9)
1577 ; AVX2-FP-NEXT: vzeroupper
1578 ; AVX2-FP-NEXT: retq
1580 ; AVX2-FCP-LABEL: load_i8_stride5_vf16:
1581 ; AVX2-FCP: # %bb.0:
1582 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
1583 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
1584 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
1585 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
1586 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
1587 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u]
1588 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u]
1589 ; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
1590 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
1591 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm3
1592 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
1593 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1,6,11]
1594 ; AVX2-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
1595 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
1596 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm5
1597 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
1598 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
1599 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
1600 ; AVX2-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
1601 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm5
1602 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,7,12]
1603 ; AVX2-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
1604 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
1605 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6
1606 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
1607 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
1608 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u]
1609 ; AVX2-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
1610 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6
1611 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3,8,13]
1612 ; AVX2-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
1613 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
1614 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm7
1615 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u]
1616 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
1617 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u]
1618 ; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
1619 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm4
1620 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4,9,14]
1621 ; AVX2-FCP-NEXT: vpor %xmm7, %xmm4, %xmm4
1622 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
1623 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
1624 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
1625 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u]
1626 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u]
1627 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
1628 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
1629 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1630 ; AVX2-FCP-NEXT: vmovdqa %xmm3, (%rsi)
1631 ; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rdx)
1632 ; AVX2-FCP-NEXT: vmovdqa %xmm6, (%rcx)
1633 ; AVX2-FCP-NEXT: vmovdqa %xmm4, (%r8)
1634 ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%r9)
1635 ; AVX2-FCP-NEXT: vzeroupper
1636 ; AVX2-FCP-NEXT: retq
1638 ; AVX512-LABEL: load_i8_stride5_vf16:
1639 ; AVX512: # %bb.0:
1640 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
1641 ; AVX512-NEXT: vmovdqa (%rdi), %ymm4
1642 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5
1643 ; AVX512-NEXT: vmovdqa %ymm1, %ymm0
1644 ; AVX512-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm0
1645 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
1646 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12,u,u,u]
1647 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u]
1648 ; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0
1649 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
1650 ; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm2
1651 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm0
1652 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,6,11]
1653 ; AVX512-NEXT: vpor %xmm6, %xmm2, %xmm6
1654 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
1655 ; AVX512-NEXT: vmovdqa %ymm2, %ymm7
1656 ; AVX512-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm7
1657 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u]
1658 ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm7
1659 ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13,u,u,u]
1660 ; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7
1661 ; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm7
1662 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,7,12]
1663 ; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7
1664 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
1665 ; AVX512-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm8
1666 ; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9
1667 ; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[1,6,11],zero,zero,zero,zero,xmm9[4,9,14,u,u,u]
1668 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,7,12],zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[u,u,u]
1669 ; AVX512-NEXT: vpor %xmm9, %xmm8, %xmm8
1670 ; AVX512-NEXT: vpshufb %xmm3, %xmm8, %xmm8
1671 ; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[3,8,13]
1672 ; AVX512-NEXT: vpor %xmm9, %xmm8, %xmm8
1673 ; AVX512-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm1
1674 ; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u]
1675 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
1676 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15,u,u,u]
1677 ; AVX512-NEXT: vpor %xmm1, %xmm9, %xmm1
1678 ; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1679 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[4,9,14]
1680 ; AVX512-NEXT: vpor %xmm3, %xmm1, %xmm1
1681 ; AVX512-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm2
1682 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
1683 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
1684 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
1685 ; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
1686 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
1687 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
1688 ; AVX512-NEXT: vmovdqa %xmm6, (%rsi)
1689 ; AVX512-NEXT: vmovdqa %xmm7, (%rdx)
1690 ; AVX512-NEXT: vmovdqa %xmm8, (%rcx)
1691 ; AVX512-NEXT: vmovdqa %xmm1, (%r8)
1692 ; AVX512-NEXT: vmovdqa %xmm0, (%r9)
1693 ; AVX512-NEXT: vzeroupper
1694 ; AVX512-NEXT: retq
1696 ; AVX512-FCP-LABEL: load_i8_stride5_vf16:
1697 ; AVX512-FCP: # %bb.0:
1698 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
1699 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4
1700 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
1701 ; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm0
1702 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm0
1703 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
1704 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12,u,u,u]
1705 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u]
1706 ; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0
1707 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
1708 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm2
1709 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm0
1710 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,6,11]
1711 ; AVX512-FCP-NEXT: vpor %xmm6, %xmm2, %xmm6
1712 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
1713 ; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm7
1714 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm7
1715 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u]
1716 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
1717 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13,u,u,u]
1718 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
1719 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm7
1720 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,7,12]
1721 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
1722 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
1723 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm8
1724 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
1725 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[1,6,11],zero,zero,zero,zero,xmm9[4,9,14,u,u,u]
1726 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,7,12],zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[u,u,u]
1727 ; AVX512-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
1728 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8
1729 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[3,8,13]
1730 ; AVX512-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
1731 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm1
1732 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u]
1733 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
1734 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15,u,u,u]
1735 ; AVX512-FCP-NEXT: vpor %xmm1, %xmm9, %xmm1
1736 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1737 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[4,9,14]
1738 ; AVX512-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1
1739 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm2
1740 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
1741 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
1742 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
1743 ; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
1744 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
1745 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
1746 ; AVX512-FCP-NEXT: vmovdqa %xmm6, (%rsi)
1747 ; AVX512-FCP-NEXT: vmovdqa %xmm7, (%rdx)
1748 ; AVX512-FCP-NEXT: vmovdqa %xmm8, (%rcx)
1749 ; AVX512-FCP-NEXT: vmovdqa %xmm1, (%r8)
1750 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%r9)
1751 ; AVX512-FCP-NEXT: vzeroupper
1752 ; AVX512-FCP-NEXT: retq
1754 ; AVX512DQ-LABEL: load_i8_stride5_vf16:
1755 ; AVX512DQ: # %bb.0:
1756 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
1757 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm4
1758 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5
1759 ; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm0
1760 ; AVX512DQ-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm0
1761 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2
1762 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12,u,u,u]
1763 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u]
1764 ; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0
1765 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
1766 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm2
1767 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm0
1768 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,6,11]
1769 ; AVX512DQ-NEXT: vpor %xmm6, %xmm2, %xmm6
1770 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
1771 ; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm7
1772 ; AVX512DQ-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm7
1773 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u]
1774 ; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm7
1775 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13,u,u,u]
1776 ; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm7
1777 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm7, %xmm7
1778 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,7,12]
1779 ; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm7
1780 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
1781 ; AVX512DQ-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm8
1782 ; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9
1783 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[1,6,11],zero,zero,zero,zero,xmm9[4,9,14,u,u,u]
1784 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,7,12],zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[u,u,u]
1785 ; AVX512DQ-NEXT: vpor %xmm9, %xmm8, %xmm8
1786 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm8, %xmm8
1787 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[3,8,13]
1788 ; AVX512DQ-NEXT: vpor %xmm9, %xmm8, %xmm8
1789 ; AVX512DQ-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm1
1790 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u]
1791 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1
1792 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15,u,u,u]
1793 ; AVX512DQ-NEXT: vpor %xmm1, %xmm9, %xmm1
1794 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1795 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[4,9,14]
1796 ; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm1
1797 ; AVX512DQ-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm2
1798 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
1799 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
1800 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
1801 ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
1802 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
1803 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
1804 ; AVX512DQ-NEXT: vmovdqa %xmm6, (%rsi)
1805 ; AVX512DQ-NEXT: vmovdqa %xmm7, (%rdx)
1806 ; AVX512DQ-NEXT: vmovdqa %xmm8, (%rcx)
1807 ; AVX512DQ-NEXT: vmovdqa %xmm1, (%r8)
1808 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%r9)
1809 ; AVX512DQ-NEXT: vzeroupper
1810 ; AVX512DQ-NEXT: retq
1812 ; AVX512DQ-FCP-LABEL: load_i8_stride5_vf16:
1813 ; AVX512DQ-FCP: # %bb.0:
1814 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
1815 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4
1816 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
1817 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm0
1818 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm0
1819 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
1820 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12,u,u,u]
1821 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u]
1822 ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0
1823 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
1824 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm2
1825 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm0
1826 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,6,11]
1827 ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm2, %xmm6
1828 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
1829 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm7
1830 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm7
1831 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u]
1832 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
1833 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13,u,u,u]
1834 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
1835 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm7
1836 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,7,12]
1837 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
1838 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
1839 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm8
1840 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
1841 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[1,6,11],zero,zero,zero,zero,xmm9[4,9,14,u,u,u]
1842 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,7,12],zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[u,u,u]
1843 ; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
1844 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8
1845 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[3,8,13]
1846 ; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
1847 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm1
1848 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u]
1849 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
1850 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15,u,u,u]
1851 ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm9, %xmm1
1852 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
1853 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[4,9,14]
1854 ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1
1855 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm2
1856 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
1857 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
1858 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
1859 ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
1860 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
1861 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
1862 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%rsi)
1863 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, (%rdx)
1864 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%rcx)
1865 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%r8)
1866 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%r9)
1867 ; AVX512DQ-FCP-NEXT: vzeroupper
1868 ; AVX512DQ-FCP-NEXT: retq
1870 ; AVX512BW-LABEL: load_i8_stride5_vf16:
1871 ; AVX512BW: # %bb.0:
1872 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
1873 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
1874 ; AVX512BW-NEXT: movw $19026, %ax # imm = 0x4A52
1875 ; AVX512BW-NEXT: kmovd %eax, %k1
1876 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1}
1877 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
1878 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u]
1879 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u]
1880 ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
1881 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
1882 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1883 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm4
1884 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[1,6,11]
1885 ; AVX512BW-NEXT: vpor %xmm5, %xmm2, %xmm2
1886 ; AVX512BW-NEXT: movw $21140, %ax # imm = 0x5294
1887 ; AVX512BW-NEXT: kmovd %eax, %k2
1888 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm5 {%k2}
1889 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
1890 ; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5
1891 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
1892 ; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5
1893 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm5, %xmm5
1894 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12]
1895 ; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5
1896 ; AVX512BW-NEXT: movw $10570, %ax # imm = 0x294A
1897 ; AVX512BW-NEXT: kmovd %eax, %k3
1898 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm6 {%k3}
1899 ; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7
1900 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
1901 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u]
1902 ; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6
1903 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm6, %xmm6
1904 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13]
1905 ; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6
1906 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm7 {%k1}
1907 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u]
1908 ; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm7
1909 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u]
1910 ; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7
1911 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm7, %xmm3
1912 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[4,9,14]
1913 ; AVX512BW-NEXT: vpor %xmm7, %xmm3, %xmm3
1914 ; AVX512BW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k2}
1915 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm0
1916 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11,u,u,u,u]
1917 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,9,14],zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[u,u,u,u]
1918 ; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0
1919 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
1920 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1921 ; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi)
1922 ; AVX512BW-NEXT: vmovdqa %xmm5, (%rdx)
1923 ; AVX512BW-NEXT: vmovdqa %xmm6, (%rcx)
1924 ; AVX512BW-NEXT: vmovdqa %xmm3, (%r8)
1925 ; AVX512BW-NEXT: vmovdqa %xmm0, (%r9)
1926 ; AVX512BW-NEXT: vzeroupper
1927 ; AVX512BW-NEXT: retq
1929 ; AVX512BW-FCP-LABEL: load_i8_stride5_vf16:
1930 ; AVX512BW-FCP: # %bb.0:
1931 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
1932 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
1933 ; AVX512BW-FCP-NEXT: movw $19026, %ax # imm = 0x4A52
1934 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
1935 ; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1}
1936 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
1937 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u]
1938 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u]
1939 ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
1940 ; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
1941 ; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
1942 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
1943 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[1,6,11]
1944 ; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm2, %xmm2
1945 ; AVX512BW-FCP-NEXT: movw $21140, %ax # imm = 0x5294
1946 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2
1947 ; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm5 {%k2}
1948 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
1949 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
1950 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
1951 ; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
1952 ; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5
1953 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12]
1954 ; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
1955 ; AVX512BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A
1956 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3
1957 ; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm6 {%k3}
1958 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
1959 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
1960 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u]
1961 ; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
1962 ; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6
1963 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13]
1964 ; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
1965 ; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm7 {%k1}
1966 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u]
1967 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
1968 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u]
1969 ; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
1970 ; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm3
1971 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[4,9,14]
1972 ; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
1973 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm0, %ymm1 {%k2}
1974 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0
1975 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11,u,u,u,u]
1976 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,9,14],zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[u,u,u,u]
1977 ; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
1978 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
1979 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
1980 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rsi)
1981 ; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%rdx)
1982 ; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%rcx)
1983 ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%r8)
1984 ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%r9)
1985 ; AVX512BW-FCP-NEXT: vzeroupper
1986 ; AVX512BW-FCP-NEXT: retq
1988 ; AVX512DQ-BW-LABEL: load_i8_stride5_vf16:
1989 ; AVX512DQ-BW: # %bb.0:
1990 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0
1991 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm1
1992 ; AVX512DQ-BW-NEXT: movw $19026, %ax # imm = 0x4A52
1993 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
1994 ; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1}
1995 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
1996 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u]
1997 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u]
1998 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
1999 ; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
2000 ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
2001 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm4
2002 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[1,6,11]
2003 ; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm2, %xmm2
2004 ; AVX512DQ-BW-NEXT: movw $21140, %ax # imm = 0x5294
2005 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2
2006 ; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm5 {%k2}
2007 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
2008 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm5
2009 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
2010 ; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5
2011 ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm5, %xmm5
2012 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12]
2013 ; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5
2014 ; AVX512DQ-BW-NEXT: movw $10570, %ax # imm = 0x294A
2015 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3
2016 ; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm6 {%k3}
2017 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7
2018 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
2019 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u]
2020 ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6
2021 ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm6, %xmm6
2022 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13]
2023 ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6
2024 ; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm7 {%k1}
2025 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u]
2026 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm7
2027 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u]
2028 ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7
2029 ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm7, %xmm3
2030 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[4,9,14]
2031 ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm3, %xmm3
2032 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k2}
2033 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm0
2034 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11,u,u,u,u]
2035 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,9,14],zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[u,u,u,u]
2036 ; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm1, %xmm0
2037 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
2038 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
2039 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rsi)
2040 ; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%rdx)
2041 ; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%rcx)
2042 ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%r8)
2043 ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%r9)
2044 ; AVX512DQ-BW-NEXT: vzeroupper
2045 ; AVX512DQ-BW-NEXT: retq
2047 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride5_vf16:
2048 ; AVX512DQ-BW-FCP: # %bb.0:
2049 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
2050 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
2051 ; AVX512DQ-BW-FCP-NEXT: movw $19026, %ax # imm = 0x4A52
2052 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
2053 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1}
2054 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
2055 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u]
2056 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u]
2057 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
2058 ; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
2059 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
2060 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
2061 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[1,6,11]
2062 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm2, %xmm2
2063 ; AVX512DQ-BW-FCP-NEXT: movw $21140, %ax # imm = 0x5294
2064 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2
2065 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm5 {%k2}
2066 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
2067 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
2068 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
2069 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
2070 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5
2071 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12]
2072 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
2073 ; AVX512DQ-BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A
2074 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3
2075 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm6 {%k3}
2076 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
2077 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
2078 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u]
2079 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
2080 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6
2081 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13]
2082 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
2083 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm7 {%k1}
2084 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u]
2085 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
2086 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u]
2087 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
2088 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm3
2089 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[4,9,14]
2090 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
2091 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm0, %ymm1 {%k2}
2092 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0
2093 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11,u,u,u,u]
2094 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,9,14],zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[u,u,u,u]
2095 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
2096 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
2097 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
2098 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rsi)
2099 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%rdx)
2100 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%rcx)
2101 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%r8)
2102 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%r9)
2103 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
2104 ; AVX512DQ-BW-FCP-NEXT: retq
2105 %wide.vec = load <80 x i8>, ptr %in.vec, align 64
2106 %strided.vec0 = shufflevector <80 x i8> %wide.vec, <80 x i8> poison, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
2107 %strided.vec1 = shufflevector <80 x i8> %wide.vec, <80 x i8> poison, <16 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36, i32 41, i32 46, i32 51, i32 56, i32 61, i32 66, i32 71, i32 76>
2108 %strided.vec2 = shufflevector <80 x i8> %wide.vec, <80 x i8> poison, <16 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37, i32 42, i32 47, i32 52, i32 57, i32 62, i32 67, i32 72, i32 77>
2109 %strided.vec3 = shufflevector <80 x i8> %wide.vec, <80 x i8> poison, <16 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38, i32 43, i32 48, i32 53, i32 58, i32 63, i32 68, i32 73, i32 78>
2110 %strided.vec4 = shufflevector <80 x i8> %wide.vec, <80 x i8> poison, <16 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39, i32 44, i32 49, i32 54, i32 59, i32 64, i32 69, i32 74, i32 79>
2111 store <16 x i8> %strided.vec0, ptr %out.vec0, align 64
2112 store <16 x i8> %strided.vec1, ptr %out.vec1, align 64
2113 store <16 x i8> %strided.vec2, ptr %out.vec2, align 64
2114 store <16 x i8> %strided.vec3, ptr %out.vec3, align 64
2115 store <16 x i8> %strided.vec4, ptr %out.vec4, align 64
2116 ret void
2117 }
2119 define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
2120 ; SSE-LABEL: load_i8_stride5_vf32:
2121 ; SSE: # %bb.0:
2122 ; SSE-NEXT: subq $184, %rsp
2123 ; SSE-NEXT: movdqa (%rdi), %xmm9
2124 ; SSE-NEXT: movdqa 16(%rdi), %xmm3
2125 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2126 ; SSE-NEXT: movdqa 32(%rdi), %xmm1
2127 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2128 ; SSE-NEXT: movdqa 48(%rdi), %xmm2
2129 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2130 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
2131 ; SSE-NEXT: movdqa %xmm4, %xmm0
2132 ; SSE-NEXT: pandn %xmm1, %xmm0
2133 ; SSE-NEXT: movdqa %xmm2, %xmm1
2134 ; SSE-NEXT: pand %xmm4, %xmm1
2135 ; SSE-NEXT: por %xmm0, %xmm1
2136 ; SSE-NEXT: pxor %xmm5, %xmm5
2137 ; SSE-NEXT: movdqa %xmm1, %xmm0
2138 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
2139 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
2140 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
2141 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
2142 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
2143 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
2144 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2145 ; SSE-NEXT: packuswb %xmm1, %xmm0
2146 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,3]
2147 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255]
2148 ; SSE-NEXT: movdqa %xmm13, %xmm0
2149 ; SSE-NEXT: pandn %xmm1, %xmm0
2150 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
2151 ; SSE-NEXT: movdqa %xmm15, %xmm1
2152 ; SSE-NEXT: pandn %xmm3, %xmm1
2153 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
2154 ; SSE-NEXT: pandn %xmm9, %xmm11
2155 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
2156 ; SSE-NEXT: movdqa %xmm14, %xmm2
2157 ; SSE-NEXT: pandn %xmm9, %xmm2
2158 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2159 ; SSE-NEXT: movdqa %xmm4, %xmm2
2160 ; SSE-NEXT: pandn %xmm9, %xmm2
2161 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2162 ; SSE-NEXT: movdqa %xmm15, %xmm2
2163 ; SSE-NEXT: pandn %xmm9, %xmm2
2164 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2165 ; SSE-NEXT: pand %xmm15, %xmm9
2166 ; SSE-NEXT: por %xmm1, %xmm9
2167 ; SSE-NEXT: movdqa %xmm9, %xmm2
2168 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
2169 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535]
2170 ; SSE-NEXT: movdqa %xmm1, %xmm6
2171 ; SSE-NEXT: pandn %xmm2, %xmm6
2172 ; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15]
2173 ; SSE-NEXT: pand %xmm1, %xmm9
2174 ; SSE-NEXT: por %xmm6, %xmm9
2175 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,2,1,3,4,5,6,7]
2176 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
2177 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
2178 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7]
2179 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
2180 ; SSE-NEXT: packuswb %xmm2, %xmm2
2181 ; SSE-NEXT: pand %xmm13, %xmm2
2182 ; SSE-NEXT: por %xmm0, %xmm2
2183 ; SSE-NEXT: movdqa 64(%rdi), %xmm6
2184 ; SSE-NEXT: movdqa %xmm6, %xmm3
2185 ; SSE-NEXT: pxor %xmm0, %xmm0
2186 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2187 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2188 ; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
2189 ; SSE-NEXT: movdqa %xmm6, %xmm0
2190 ; SSE-NEXT: movdqa %xmm6, %xmm8
2191 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2192 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[0,0]
2193 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[2,3]
2194 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
2195 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
2196 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
2197 ; SSE-NEXT: packuswb %xmm0, %xmm0
2198 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
2199 ; SSE-NEXT: movdqa %xmm9, %xmm6
2200 ; SSE-NEXT: pandn %xmm0, %xmm6
2201 ; SSE-NEXT: pand %xmm9, %xmm2
2202 ; SSE-NEXT: por %xmm2, %xmm6
2203 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2204 ; SSE-NEXT: movdqa 112(%rdi), %xmm10
2205 ; SSE-NEXT: movdqa %xmm4, %xmm0
2206 ; SSE-NEXT: pandn %xmm10, %xmm0
2207 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2208 ; SSE-NEXT: movdqa 128(%rdi), %xmm7
2209 ; SSE-NEXT: movdqa %xmm7, %xmm2
2210 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2211 ; SSE-NEXT: pand %xmm4, %xmm2
2212 ; SSE-NEXT: por %xmm0, %xmm2
2213 ; SSE-NEXT: movdqa %xmm2, %xmm0
2214 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
2215 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,1,1,3]
2216 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7]
2217 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
2218 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
2219 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
2220 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
2221 ; SSE-NEXT: packuswb %xmm2, %xmm0
2222 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3]
2223 ; SSE-NEXT: movdqa %xmm13, %xmm2
2224 ; SSE-NEXT: movdqa %xmm13, %xmm3
2225 ; SSE-NEXT: pandn %xmm0, %xmm2
2226 ; SSE-NEXT: movdqa 96(%rdi), %xmm4
2227 ; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill
2228 ; SSE-NEXT: movdqa %xmm15, %xmm0
2229 ; SSE-NEXT: pandn %xmm4, %xmm0
2230 ; SSE-NEXT: movdqa 80(%rdi), %xmm6
2231 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2232 ; SSE-NEXT: pand %xmm15, %xmm6
2233 ; SSE-NEXT: por %xmm0, %xmm6
2234 ; SSE-NEXT: movdqa %xmm6, %xmm0
2235 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
2236 ; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
2237 ; SSE-NEXT: pand %xmm1, %xmm6
2238 ; SSE-NEXT: pandn %xmm0, %xmm1
2239 ; SSE-NEXT: por %xmm6, %xmm1
2240 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,1,3,4,5,6,7]
2241 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
2242 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
2243 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
2244 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
2245 ; SSE-NEXT: packuswb %xmm0, %xmm0
2246 ; SSE-NEXT: pand %xmm13, %xmm0
2247 ; SSE-NEXT: por %xmm2, %xmm0
2248 ; SSE-NEXT: movdqa 144(%rdi), %xmm12
2249 ; SSE-NEXT: movdqa %xmm12, %xmm2
2250 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
2251 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2252 ; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15]
2253 ; SSE-NEXT: movdqa %xmm12, %xmm1
2254 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2255 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0]
2256 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3]
2257 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
2258 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
2259 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4]
2260 ; SSE-NEXT: packuswb %xmm1, %xmm1
2261 ; SSE-NEXT: movdqa %xmm9, %xmm2
2262 ; SSE-NEXT: pandn %xmm1, %xmm2
2263 ; SSE-NEXT: pand %xmm9, %xmm0
2264 ; SSE-NEXT: por %xmm0, %xmm2
2265 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2266 ; SSE-NEXT: movdqa %xmm15, %xmm0
2267 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2268 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2269 ; SSE-NEXT: pand %xmm15, %xmm1
2270 ; SSE-NEXT: por %xmm0, %xmm1
2271 ; SSE-NEXT: movdqa %xmm1, %xmm0
2272 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
2273 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
2274 ; SSE-NEXT: movdqa %xmm1, %xmm2
2275 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
2276 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
2277 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
2278 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,6,7]
2279 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
2280 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
2281 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
2282 ; SSE-NEXT: psllq $48, %xmm1
2283 ; SSE-NEXT: packuswb %xmm0, %xmm1
2284 ; SSE-NEXT: movdqa %xmm13, %xmm2
2285 ; SSE-NEXT: pandn %xmm1, %xmm2
2286 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2287 ; SSE-NEXT: movdqa %xmm4, %xmm1
2288 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
2289 ; SSE-NEXT: pand %xmm13, %xmm1
2290 ; SSE-NEXT: por %xmm11, %xmm1
2291 ; SSE-NEXT: movdqa %xmm1, %xmm6
2292 ; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
2293 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,0,65535,65535,65535,0]
2294 ; SSE-NEXT: movdqa %xmm0, %xmm11
2295 ; SSE-NEXT: pandn %xmm6, %xmm11
2296 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
2297 ; SSE-NEXT: pand %xmm0, %xmm1
2298 ; SSE-NEXT: por %xmm11, %xmm1
2299 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
2300 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
2301 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
2302 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
2303 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,7]
2304 ; SSE-NEXT: packuswb %xmm1, %xmm1
2305 ; SSE-NEXT: pand %xmm3, %xmm1
2306 ; SSE-NEXT: movdqa %xmm3, %xmm11
2307 ; SSE-NEXT: por %xmm2, %xmm1
2308 ; SSE-NEXT: movdqa %xmm8, %xmm2
2309 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2310 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm6[3,0]
2311 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm2[0,2]
2312 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,7,6,7]
2313 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
2314 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
2315 ; SSE-NEXT: packuswb %xmm2, %xmm2
2316 ; SSE-NEXT: movdqa %xmm9, %xmm3
2317 ; SSE-NEXT: pandn %xmm2, %xmm3
2318 ; SSE-NEXT: pand %xmm9, %xmm1
2319 ; SSE-NEXT: por %xmm1, %xmm3
2320 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2321 ; SSE-NEXT: movdqa %xmm15, %xmm2
2322 ; SSE-NEXT: pandn %xmm10, %xmm2
2323 ; SSE-NEXT: movdqa %xmm7, %xmm1
2324 ; SSE-NEXT: pand %xmm15, %xmm1
2325 ; SSE-NEXT: por %xmm2, %xmm1
2326 ; SSE-NEXT: movdqa %xmm1, %xmm2
2327 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
2328 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
2329 ; SSE-NEXT: movdqa %xmm1, %xmm6
2330 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm2[0,0]
2331 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,3]
2332 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2,1,3]
2333 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,7]
2334 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,1]
2335 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
2336 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
2337 ; SSE-NEXT: psllq $48, %xmm1
2338 ; SSE-NEXT: packuswb %xmm2, %xmm1
2339 ; SSE-NEXT: movdqa %xmm13, %xmm2
2340 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2341 ; SSE-NEXT: pandn %xmm7, %xmm2
2342 ; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload
2343 ; SSE-NEXT: movdqa %xmm8, %xmm6
2344 ; SSE-NEXT: pand %xmm13, %xmm6
2345 ; SSE-NEXT: por %xmm2, %xmm6
2346 ; SSE-NEXT: movdqa %xmm6, %xmm2
2347 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
2348 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
2349 ; SSE-NEXT: pand %xmm0, %xmm6
2350 ; SSE-NEXT: pandn %xmm2, %xmm0
2351 ; SSE-NEXT: por %xmm6, %xmm0
2352 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2353 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
2354 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
2355 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
2356 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,7]
2357 ; SSE-NEXT: packuswb %xmm0, %xmm0
2358 ; SSE-NEXT: movdqa %xmm11, %xmm2
2359 ; SSE-NEXT: pand %xmm11, %xmm0
2360 ; SSE-NEXT: pandn %xmm1, %xmm2
2361 ; SSE-NEXT: por %xmm2, %xmm0
2362 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2363 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm2[3,0]
2364 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[0,2]
2365 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,7,6,7]
2366 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
2367 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,4]
2368 ; SSE-NEXT: packuswb %xmm1, %xmm1
2369 ; SSE-NEXT: movdqa %xmm9, %xmm2
2370 ; SSE-NEXT: pandn %xmm1, %xmm2
2371 ; SSE-NEXT: pand %xmm9, %xmm0
2372 ; SSE-NEXT: por %xmm0, %xmm2
2373 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2374 ; SSE-NEXT: pand %xmm14, %xmm4
2375 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
2376 ; SSE-NEXT: movdqa %xmm4, %xmm2
2377 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
2378 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,0,65535,65535,65535]
2379 ; SSE-NEXT: movdqa %xmm3, %xmm6
2380 ; SSE-NEXT: pandn %xmm2, %xmm6
2381 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
2382 ; SSE-NEXT: pand %xmm3, %xmm4
2383 ; SSE-NEXT: por %xmm6, %xmm4
2384 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[2,1,2,3,4,5,6,7]
2385 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
2386 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2387 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
2388 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2389 ; SSE-NEXT: packuswb %xmm0, %xmm0
2390 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,65535,65535,65535,65535,65535]
2391 ; SSE-NEXT: movdqa %xmm1, %xmm2
2392 ; SSE-NEXT: movdqa %xmm1, %xmm10
2393 ; SSE-NEXT: pandn %xmm0, %xmm2
2394 ; SSE-NEXT: movdqa %xmm15, %xmm0
2395 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2396 ; SSE-NEXT: pandn %xmm1, %xmm0
2397 ; SSE-NEXT: movdqa %xmm13, %xmm6
2398 ; SSE-NEXT: movdqa %xmm13, %xmm12
2399 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
2400 ; SSE-NEXT: pandn %xmm11, %xmm6
2401 ; SSE-NEXT: movdqa %xmm14, %xmm4
2402 ; SSE-NEXT: pandn %xmm11, %xmm4
2403 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2404 ; SSE-NEXT: pand %xmm15, %xmm11
2405 ; SSE-NEXT: movdqa %xmm15, %xmm4
2406 ; SSE-NEXT: por %xmm0, %xmm11
2407 ; SSE-NEXT: movdqa %xmm11, %xmm0
2408 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
2409 ; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7]
2410 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,1,2,0]
2411 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[3,0]
2412 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm11[0,2]
2413 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
2414 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
2415 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2416 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
2417 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
2418 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,4,5,6,5]
2419 ; SSE-NEXT: packuswb %xmm0, %xmm11
2420 ; SSE-NEXT: pand %xmm10, %xmm11
2421 ; SSE-NEXT: por %xmm2, %xmm11
2422 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2423 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
2424 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2425 ; SSE-NEXT: # xmm2 = mem[0,2,2,3]
2426 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2427 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7]
2428 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
2429 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
2430 ; SSE-NEXT: packuswb %xmm0, %xmm0
2431 ; SSE-NEXT: movdqa %xmm9, %xmm2
2432 ; SSE-NEXT: pandn %xmm0, %xmm2
2433 ; SSE-NEXT: pand %xmm9, %xmm11
2434 ; SSE-NEXT: por %xmm11, %xmm2
2435 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2436 ; SSE-NEXT: movdqa %xmm14, %xmm0
2437 ; SSE-NEXT: pandn %xmm7, %xmm0
2438 ; SSE-NEXT: movdqa %xmm8, %xmm15
2439 ; SSE-NEXT: movdqa %xmm8, %xmm2
2440 ; SSE-NEXT: pand %xmm14, %xmm2
2441 ; SSE-NEXT: por %xmm0, %xmm2
2442 ; SSE-NEXT: movdqa %xmm2, %xmm0
2443 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
2444 ; SSE-NEXT: movdqa %xmm3, %xmm11
2445 ; SSE-NEXT: pandn %xmm0, %xmm11
2446 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
2447 ; SSE-NEXT: pand %xmm3, %xmm2
2448 ; SSE-NEXT: por %xmm11, %xmm2
2449 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,2,3,4,5,6,7]
2450 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
2451 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2452 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
2453 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2454 ; SSE-NEXT: packuswb %xmm0, %xmm0
2455 ; SSE-NEXT: movdqa %xmm10, %xmm13
2456 ; SSE-NEXT: pandn %xmm0, %xmm13
2457 ; SSE-NEXT: movdqa %xmm4, %xmm11
2458 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2459 ; SSE-NEXT: pandn %xmm2, %xmm11
2460 ; SSE-NEXT: movdqa %xmm1, %xmm5
2461 ; SSE-NEXT: movdqa %xmm1, %xmm0
2462 ; SSE-NEXT: movdqa %xmm12, %xmm1
2463 ; SSE-NEXT: pand %xmm12, %xmm0
2464 ; SSE-NEXT: movdqa %xmm2, %xmm7
2465 ; SSE-NEXT: pand %xmm12, %xmm7
2466 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2467 ; SSE-NEXT: pandn %xmm8, %xmm1
2468 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2469 ; SSE-NEXT: pand %xmm14, %xmm5
2470 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2471 ; SSE-NEXT: pand %xmm14, %xmm2
2472 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2473 ; SSE-NEXT: pandn %xmm8, %xmm14
2474 ; SSE-NEXT: pand %xmm4, %xmm8
2475 ; SSE-NEXT: por %xmm11, %xmm8
2476 ; SSE-NEXT: movdqa %xmm8, %xmm11
2477 ; SSE-NEXT: pxor %xmm1, %xmm1
2478 ; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15]
2479 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
2480 ; SSE-NEXT: pxor %xmm2, %xmm2
2481 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,2,0]
2482 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm11[3,0]
2483 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm8[0,2]
2484 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[2,1,2,3,4,5,6,7]
2485 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7]
2486 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
2487 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,3,2,1,4,5,6,7]
2488 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6]
2489 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
2490 ; SSE-NEXT: packuswb %xmm8, %xmm1
2491 ; SSE-NEXT: pand %xmm10, %xmm1
2492 ; SSE-NEXT: por %xmm13, %xmm1
2493 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2494 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1]
2495 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2496 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,2,2,3]
2497 ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
2498 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[0,3,2,3,4,5,6,7]
2499 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0]
2500 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5]
2501 ; SSE-NEXT: packuswb %xmm8, %xmm11
2502 ; SSE-NEXT: movdqa %xmm9, %xmm12
2503 ; SSE-NEXT: pandn %xmm11, %xmm12
2504 ; SSE-NEXT: pand %xmm9, %xmm1
2505 ; SSE-NEXT: por %xmm1, %xmm12
2506 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2507 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
2508 ; SSE-NEXT: pand %xmm13, %xmm1
2509 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2510 ; SSE-NEXT: movdqa %xmm1, %xmm11
2511 ; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15]
2512 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2513 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm11[2,0]
2514 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
2515 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
2516 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7]
2517 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,6,7]
2518 ; SSE-NEXT: packuswb %xmm1, %xmm1
2519 ; SSE-NEXT: movdqa %xmm10, %xmm11
2520 ; SSE-NEXT: pandn %xmm1, %xmm11
2521 ; SSE-NEXT: por %xmm6, %xmm0
2522 ; SSE-NEXT: movdqa %xmm0, %xmm1
2523 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2524 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
2525 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0]
2526 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2527 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
2528 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7]
2529 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
2530 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
2531 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6]
2532 ; SSE-NEXT: packuswb %xmm0, %xmm1
2533 ; SSE-NEXT: pand %xmm10, %xmm1
2534 ; SSE-NEXT: por %xmm11, %xmm1
2535 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2536 ; SSE-NEXT: movaps %xmm10, %xmm0
2537 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2538 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[3,0]
2539 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2]
2540 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,6,6,7]
2541 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
2542 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,5]
2543 ; SSE-NEXT: packuswb %xmm0, %xmm6
2544 ; SSE-NEXT: movdqa %xmm9, %xmm8
2545 ; SSE-NEXT: pandn %xmm6, %xmm8
2546 ; SSE-NEXT: pand %xmm9, %xmm1
2547 ; SSE-NEXT: por %xmm1, %xmm8
2548 ; SSE-NEXT: movdqa %xmm13, %xmm0
2549 ; SSE-NEXT: pand %xmm13, %xmm15
2550 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2551 ; SSE-NEXT: por %xmm15, %xmm0
2552 ; SSE-NEXT: movdqa %xmm0, %xmm1
2553 ; SSE-NEXT: pxor %xmm6, %xmm6
2554 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
2555 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
2556 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0]
2557 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2558 ; SSE-NEXT: por %xmm7, %xmm2
2559 ; SSE-NEXT: movdqa %xmm2, %xmm1
2560 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
2561 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
2562 ; SSE-NEXT: pxor %xmm13, %xmm13
2563 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,0]
2564 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
2565 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
2566 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
2567 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
2568 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
2569 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6]
2570 ; SSE-NEXT: packuswb %xmm2, %xmm1
2571 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,65535,65535,65535,65535,65535]
2572 ; SSE-NEXT: pand %xmm2, %xmm1
2573 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,6,5]
2574 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[3,1,2,0]
2575 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,0,1,2,4,5,6,7]
2576 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,4,6,7]
2577 ; SSE-NEXT: packuswb %xmm6, %xmm6
2578 ; SSE-NEXT: pandn %xmm6, %xmm2
2579 ; SSE-NEXT: por %xmm2, %xmm1
2580 ; SSE-NEXT: movdqa %xmm4, %xmm2
2581 ; SSE-NEXT: movdqa %xmm4, %xmm15
2582 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[3,0]
2583 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2]
2584 ; SSE-NEXT: pand %xmm9, %xmm1
2585 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7]
2586 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
2587 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,5]
2588 ; SSE-NEXT: packuswb %xmm2, %xmm2
2589 ; SSE-NEXT: pandn %xmm2, %xmm9
2590 ; SSE-NEXT: por %xmm1, %xmm9
2591 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2592 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2593 ; SSE-NEXT: movdqa %xmm0, %xmm1
2594 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15]
2595 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
2596 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
2597 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
2598 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
2599 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
2600 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
2601 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2602 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,0,4,5,6,7]
2603 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
2604 ; SSE-NEXT: packuswb %xmm1, %xmm2
2605 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535]
2606 ; SSE-NEXT: movdqa %xmm4, %xmm6
2607 ; SSE-NEXT: pandn %xmm2, %xmm6
2608 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2609 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
2610 ; SSE-NEXT: pand %xmm5, %xmm2
2611 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2612 ; SSE-NEXT: movdqa %xmm2, %xmm0
2613 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
2614 ; SSE-NEXT: movdqa %xmm3, %xmm11
2615 ; SSE-NEXT: pandn %xmm0, %xmm11
2616 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15]
2617 ; SSE-NEXT: pand %xmm3, %xmm2
2618 ; SSE-NEXT: por %xmm11, %xmm2
2619 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,0,3,4,5,6,7]
2620 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4]
2621 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
2622 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,0,4,5,6,7]
2623 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
2624 ; SSE-NEXT: packuswb %xmm2, %xmm2
2625 ; SSE-NEXT: pand %xmm4, %xmm2
2626 ; SSE-NEXT: por %xmm6, %xmm2
2627 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
2628 ; SSE-NEXT: # xmm6 = mem[3,1,2,3]
2629 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
2630 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,2,1,4,5,6,7]
2631 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,0,3,4,5,6,7]
2632 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1]
2633 ; SSE-NEXT: packuswb %xmm1, %xmm10
2634 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,1]
2635 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
2636 ; SSE-NEXT: movdqa %xmm14, %xmm1
2637 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15]
2638 ; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
2639 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,1,1,3]
2640 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm1[1,2]
2641 ; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
2642 ; SSE-NEXT: pand %xmm5, %xmm0
2643 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2644 ; SSE-NEXT: por %xmm0, %xmm5
2645 ; SSE-NEXT: movdqa %xmm5, %xmm1
2646 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7]
2647 ; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15]
2648 ; SSE-NEXT: pand %xmm3, %xmm5
2649 ; SSE-NEXT: pandn %xmm1, %xmm3
2650 ; SSE-NEXT: por %xmm5, %xmm3
2651 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[3,1,0,3,4,5,6,7]
2652 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4]
2653 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
2654 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
2655 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
2656 ; SSE-NEXT: packuswb %xmm1, %xmm1
2657 ; SSE-NEXT: pand %xmm4, %xmm1
2658 ; SSE-NEXT: movdqa %xmm4, %xmm7
2659 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2,3,1]
2660 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[3,1,2,3,4,5,6,7]
2661 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
2662 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
2663 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,3,0,4,5,6,7]
2664 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5,4,7]
2665 ; SSE-NEXT: packuswb %xmm3, %xmm4
2666 ; SSE-NEXT: pandn %xmm4, %xmm7
2667 ; SSE-NEXT: por %xmm7, %xmm1
2668 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
2669 ; SSE-NEXT: # xmm4 = mem[3,1,2,3]
2670 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,2,2,3]
2671 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7]
2672 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
2673 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
2674 ; SSE-NEXT: packuswb %xmm3, %xmm5
2675 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,1]
2676 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2677 ; SSE-NEXT: movaps %xmm0, 16(%rsi)
2678 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2679 ; SSE-NEXT: movaps %xmm0, (%rsi)
2680 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2681 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
2682 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2683 ; SSE-NEXT: movaps %xmm0, (%rdx)
2684 ; SSE-NEXT: movdqa %xmm12, 16(%rcx)
2685 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2686 ; SSE-NEXT: movaps %xmm0, (%rcx)
2687 ; SSE-NEXT: movdqa %xmm9, 16(%r8)
2688 ; SSE-NEXT: movdqa %xmm8, (%r8)
2689 ; SSE-NEXT: movaps %xmm1, 16(%r9)
2690 ; SSE-NEXT: movaps %xmm2, (%r9)
2691 ; SSE-NEXT: addq $184, %rsp
2692 ; SSE-NEXT: retq
2694 ; AVX-LABEL: load_i8_stride5_vf32:
2695 ; AVX: # %bb.0:
2696 ; AVX-NEXT: vmovdqa 144(%rdi), %xmm0
2697 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[1,6,11]
2698 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm1
2699 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
2700 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm5
2701 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm2
2702 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u],zero,zero,zero,xmm2[3,8,13,u,u,u,u,u,u]
2703 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm3
2704 ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,4,9,14],zero,zero,zero,xmm3[u,u,u,u,u,u]
2705 ; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4
2706 ; AVX-NEXT: vpxor %xmm6, %xmm6, %xmm6
2707 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3,4,5,6,7]
2708 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm4
2709 ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[0,5,10,15],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u]
2710 ; AVX-NEXT: vpor %xmm7, %xmm6, %xmm6
2711 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7]
2712 ; AVX-NEXT: vmovdqa (%rdi), %xmm7
2713 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm9
2714 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm8
2715 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm10
2716 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
2717 ; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
2718 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm5[2],xmm11[3],xmm5[3]
2719 ; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12,u,u,u]
2720 ; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm8[u,u,u]
2721 ; AVX-NEXT: vpor %xmm11, %xmm12, %xmm11
2722 ; AVX-NEXT: vmovq {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
2723 ; AVX-NEXT: vpblendvb %xmm13, %xmm5, %xmm11, %xmm5
2724 ; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
2725 ; AVX-NEXT: vandps %ymm5, %ymm12, %ymm11
2726 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm5
2727 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11]
2728 ; AVX-NEXT: vandnps %ymm14, %ymm12, %ymm14
2729 ; AVX-NEXT: vorps %ymm14, %ymm11, %ymm11
2730 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm6
2731 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2732 ; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,7,12]
2733 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
2734 ; AVX-NEXT: vpor %xmm11, %xmm14, %xmm11
2735 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u]
2736 ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u]
2737 ; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14
2738 ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13,u,u,u]
2739 ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm8[u,u,u]
2740 ; AVX-NEXT: vpor %xmm6, %xmm15, %xmm6
2741 ; AVX-NEXT: vpblendvb %xmm13, %xmm14, %xmm6, %xmm6
2742 ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14,u,u,u,u,u,u]
2743 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,0,5,10,15],zero,zero,zero,xmm3[u,u,u,u,u,u]
2744 ; AVX-NEXT: vpor %xmm13, %xmm14, %xmm14
2745 ; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u]
2746 ; AVX-NEXT: vpshufb %xmm13, %xmm14, %xmm14
2747 ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u]
2748 ; AVX-NEXT: vpor %xmm15, %xmm14, %xmm14
2749 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2,3,4],xmm11[5,6,7]
2750 ; AVX-NEXT: vandps %ymm6, %ymm12, %ymm6
2751 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12]
2752 ; AVX-NEXT: vandnps %ymm14, %ymm12, %ymm14
2753 ; AVX-NEXT: vorps %ymm6, %ymm14, %ymm6
2754 ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm6, %ymm6
2755 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2756 ; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u]
2757 ; AVX-NEXT: vpshufb %xmm6, %xmm9, %xmm14
2758 ; AVX-NEXT: vpshufb %xmm6, %xmm10, %xmm6
2759 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3,4,5,6,7]
2760 ; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u]
2761 ; AVX-NEXT: vpshufb %xmm14, %xmm7, %xmm15
2762 ; AVX-NEXT: vpshufb %xmm14, %xmm8, %xmm14
2763 ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4,5,6,7]
2764 ; AVX-NEXT: vpor %xmm6, %xmm14, %xmm6
2765 ; AVX-NEXT: vandps %ymm6, %ymm12, %ymm6
2766 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13]
2767 ; AVX-NEXT: vandnps %ymm14, %ymm12, %ymm12
2768 ; AVX-NEXT: vorps %ymm6, %ymm12, %ymm6
2769 ; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13]
2770 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
2771 ; AVX-NEXT: vpor %xmm12, %xmm14, %xmm12
2772 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,1,6,11],zero,zero,zero,zero,xmm3[u,u,u,u,u,u]
2773 ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm2[u,u,u],zero,zero,zero,xmm2[0,5,10,15,u,u,u,u,u,u]
2774 ; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14
2775 ; AVX-NEXT: vpshufb %xmm13, %xmm14, %xmm13
2776 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u]
2777 ; AVX-NEXT: vpor %xmm14, %xmm13, %xmm13
2778 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3,4],xmm12[5,6,7]
2779 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm12
2780 ; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,2,7,12,1,6,11,128,128,128,128,u,u,u]
2781 ; AVX-NEXT: vpshufb %xmm6, %xmm9, %xmm13
2782 ; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm6
2783 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3,4,5,6,7]
2784 ; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [3,8,13,128,128,128,128,128,128,0,5,10,15,u,u,u]
2785 ; AVX-NEXT: vpshufb %xmm13, %xmm7, %xmm14
2786 ; AVX-NEXT: vpshufb %xmm13, %xmm10, %xmm13
2787 ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3,4,5,6,7]
2788 ; AVX-NEXT: vpor %xmm6, %xmm13, %xmm6
2789 ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,xmm2[1,6,11,u,u,u,u,u,u,u]
2790 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,2,7,12],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
2791 ; AVX-NEXT: vpor %xmm13, %xmm14, %xmm14
2792 ; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u]
2793 ; AVX-NEXT: vpshufb %xmm13, %xmm14, %xmm14
2794 ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[3,8,13],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u,u]
2795 ; AVX-NEXT: vpor %xmm15, %xmm14, %xmm14
2796 ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14]
2797 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14
2798 ; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255]
2799 ; AVX-NEXT: vandps %ymm6, %ymm15, %ymm6
2800 ; AVX-NEXT: vandnps %ymm14, %ymm15, %ymm14
2801 ; AVX-NEXT: vorps %ymm6, %ymm14, %ymm6
2802 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm0[4,9,14]
2803 ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
2804 ; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14
2805 ; AVX-NEXT: vextractf128 $1, %ymm6, %xmm15
2806 ; AVX-NEXT: vpmovsxwq {{.*#+}} xmm11 = [18446744073709551615,255]
2807 ; AVX-NEXT: vpblendvb %xmm11, %xmm15, %xmm14, %xmm14
2808 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm6, %ymm6
2809 ; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u]
2810 ; AVX-NEXT: vpshufb %xmm14, %xmm10, %xmm10
2811 ; AVX-NEXT: vpshufb %xmm14, %xmm9, %xmm9
2812 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4,5],xmm9[6,7]
2813 ; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u]
2814 ; AVX-NEXT: vpshufb %xmm10, %xmm8, %xmm8
2815 ; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm7
2816 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4,5],xmm7[6,7]
2817 ; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7
2818 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[2,7,12,u,u,u,u,u,u,u]
2819 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,3,8,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
2820 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
2821 ; AVX-NEXT: vpshufb %xmm13, %xmm2, %xmm2
2822 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[4,9,14],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u,u]
2823 ; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
2824 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,1,6,11],zero,zero,zero,zero
2825 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[0,5,10,15]
2826 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
2827 ; AVX-NEXT: vpblendvb %xmm11, %xmm2, %xmm0, %xmm0
2828 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
2829 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
2830 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7]
2831 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
2832 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2833 ; AVX-NEXT: vmovaps %ymm1, (%rsi)
2834 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2835 ; AVX-NEXT: vmovaps %ymm1, (%rdx)
2836 ; AVX-NEXT: vmovaps %ymm12, (%rcx)
2837 ; AVX-NEXT: vmovaps %ymm6, (%r8)
2838 ; AVX-NEXT: vmovaps %ymm0, (%r9)
2839 ; AVX-NEXT: vzeroupper
2840 ; AVX-NEXT: retq
2842 ; AVX2-LABEL: load_i8_stride5_vf32:
2843 ; AVX2: # %bb.0:
2844 ; AVX2-NEXT: vmovdqa (%rdi), %ymm3
2845 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4
2846 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0
2847 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1
2848 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
2849 ; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm5
2850 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
2851 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
2852 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
2853 ; AVX2-NEXT: vpor %xmm6, %xmm5, %xmm5
2854 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
2855 ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm6
2856 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
2857 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255]
2858 ; AVX2-NEXT: # ymm8 = mem[0,1,0,1]
2859 ; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
2860 ; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
2861 ; AVX2-NEXT: vpmovsxwd {{.*#+}} ymm8 = [4294967295,4294967295,4294967295,255,0,0,0,0]
2862 ; AVX2-NEXT: vmovdqa %xmm8, %xmm7
2863 ; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm6
2864 ; AVX2-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm5
2865 ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
2866 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5
2867 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
2868 ; AVX2-NEXT: vpor %xmm5, %xmm9, %xmm5
2869 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
2870 ; AVX2-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm10
2871 ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
2872 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0]
2873 ; AVX2-NEXT: # ymm12 = mem[0,1,0,1]
2874 ; AVX2-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
2875 ; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
2876 ; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm10, %ymm5
2877 ; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm9
2878 ; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm10
2879 ; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u]
2880 ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u]
2881 ; AVX2-NEXT: vpor %xmm10, %xmm9, %xmm9
2882 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
2883 ; AVX2-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm11
2884 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
2885 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255]
2886 ; AVX2-NEXT: # ymm13 = mem[0,1,0,1]
2887 ; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
2888 ; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
2889 ; AVX2-NEXT: vpblendvb %ymm7, %ymm9, %ymm11, %ymm7
2890 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
2891 ; AVX2-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm11
2892 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
2893 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0]
2894 ; AVX2-NEXT: # ymm13 = mem[0,1,0,1]
2895 ; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
2896 ; AVX2-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm10
2897 ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u]
2898 ; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm10
2899 ; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u]
2900 ; AVX2-NEXT: vpor %xmm12, %xmm10, %xmm10
2901 ; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
2902 ; AVX2-NEXT: vpblendvb %ymm8, %ymm10, %ymm11, %ymm10
2903 ; AVX2-NEXT: vmovdqa 144(%rdi), %xmm8
2904 ; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[1,6,11]
2905 ; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm9
2906 ; AVX2-NEXT: vmovdqa 128(%rdi), %xmm3
2907 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
2908 ; AVX2-NEXT: vpor %xmm4, %xmm11, %xmm4
2909 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2910 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7],ymm6[8,9,10,11,12],ymm4[13,14,15]
2911 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
2912 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2913 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
2914 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255]
2915 ; AVX2-NEXT: # ymm2 = mem[0,1,0,1]
2916 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
2917 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[4,9,14]
2918 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
2919 ; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1
2920 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
2921 ; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,18446744073709551615,255]
2922 ; AVX2-NEXT: vpblendvb %ymm2, %ymm10, %ymm1, %ymm1
2923 ; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm6
2924 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u]
2925 ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12],zero,zero,zero,xmm9[u,u,u,u]
2926 ; AVX2-NEXT: vpor %xmm6, %xmm9, %xmm6
2927 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm9
2928 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
2929 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7]
2930 ; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm9[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
2931 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,5,0,5,0,5,0,5]
2932 ; AVX2-NEXT: vpermd %ymm6, %ymm9, %ymm6
2933 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm6, %ymm0
2934 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[2,7,12]
2935 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
2936 ; AVX2-NEXT: vpor %xmm2, %xmm6, %xmm2
2937 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
2938 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7],ymm5[8,9,10,11,12],ymm2[13,14,15]
2939 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
2940 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[3,8,13]
2941 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
2942 ; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3
2943 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
2944 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7],ymm7[8,9,10,11,12],ymm3[13,14,15]
2945 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7]
2946 ; AVX2-NEXT: vmovdqa %ymm4, (%rsi)
2947 ; AVX2-NEXT: vmovdqa %ymm2, (%rdx)
2948 ; AVX2-NEXT: vmovdqa %ymm3, (%rcx)
2949 ; AVX2-NEXT: vmovdqa %ymm1, (%r8)
2950 ; AVX2-NEXT: vmovdqa %ymm0, (%r9)
2951 ; AVX2-NEXT: vzeroupper
2952 ; AVX2-NEXT: retq
2954 ; AVX2-FP-LABEL: load_i8_stride5_vf32:
2955 ; AVX2-FP: # %bb.0:
2956 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3
2957 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm4
2958 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0
2959 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1
2960 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
2961 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm5
2962 ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6
2963 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
2964 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
2965 ; AVX2-FP-NEXT: vpor %xmm6, %xmm5, %xmm5
2966 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
2967 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm6
2968 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
2969 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255]
2970 ; AVX2-FP-NEXT: # ymm8 = mem[0,1,0,1]
2971 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
2972 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
2973 ; AVX2-FP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [4294967295,4294967295,4294967295,255,0,0,0,0]
2974 ; AVX2-FP-NEXT: vmovdqa %xmm8, %xmm7
2975 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm6
2976 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm5
2977 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
2978 ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5
2979 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
2980 ; AVX2-FP-NEXT: vpor %xmm5, %xmm9, %xmm5
2981 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
2982 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm10
2983 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
2984 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0]
2985 ; AVX2-FP-NEXT: # ymm12 = mem[0,1,0,1]
2986 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
2987 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
2988 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm10, %ymm5
2989 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm9
2990 ; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm10
2991 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u]
2992 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u]
2993 ; AVX2-FP-NEXT: vpor %xmm10, %xmm9, %xmm9
2994 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
2995 ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm11
2996 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
2997 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255]
2998 ; AVX2-FP-NEXT: # ymm13 = mem[0,1,0,1]
2999 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
3000 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
3001 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm9, %ymm11, %ymm7
3002 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
3003 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm11
3004 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
3005 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0]
3006 ; AVX2-FP-NEXT: # ymm13 = mem[0,1,0,1]
3007 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
3008 ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm10
3009 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u]
3010 ; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm10
3011 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u]
3012 ; AVX2-FP-NEXT: vpor %xmm12, %xmm10, %xmm10
3013 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
3014 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm10, %ymm11, %ymm10
3015 ; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm8
3016 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[1,6,11]
3017 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm9
3018 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm3
3019 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
3020 ; AVX2-FP-NEXT: vpor %xmm4, %xmm11, %xmm4
3021 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3022 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7],ymm6[8,9,10,11,12],ymm4[13,14,15]
3023 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
3024 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
3025 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3026 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255]
3027 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,0,1]
3028 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
3029 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[4,9,14]
3030 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
3031 ; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm1
3032 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3033 ; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,18446744073709551615,255]
3034 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm10, %ymm1, %ymm1
3035 ; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm6
3036 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u]
3037 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12],zero,zero,zero,xmm9[u,u,u,u]
3038 ; AVX2-FP-NEXT: vpor %xmm6, %xmm9, %xmm6
3039 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm9
3040 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
3041 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7]
3042 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm9[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
3043 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,5,0,5,0,5,0,5]
3044 ; AVX2-FP-NEXT: vpermd %ymm6, %ymm9, %ymm6
3045 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm6, %ymm0
3046 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[2,7,12]
3047 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
3048 ; AVX2-FP-NEXT: vpor %xmm2, %xmm6, %xmm2
3049 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
3050 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7],ymm5[8,9,10,11,12],ymm2[13,14,15]
3051 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
3052 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[3,8,13]
3053 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
3054 ; AVX2-FP-NEXT: vpor %xmm5, %xmm3, %xmm3
3055 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
3056 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7],ymm7[8,9,10,11,12],ymm3[13,14,15]
3057 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7]
3058 ; AVX2-FP-NEXT: vmovdqa %ymm4, (%rsi)
3059 ; AVX2-FP-NEXT: vmovdqa %ymm2, (%rdx)
3060 ; AVX2-FP-NEXT: vmovdqa %ymm3, (%rcx)
3061 ; AVX2-FP-NEXT: vmovdqa %ymm1, (%r8)
3062 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%r9)
3063 ; AVX2-FP-NEXT: vzeroupper
3064 ; AVX2-FP-NEXT: retq
3066 ; AVX2-FCP-LABEL: load_i8_stride5_vf32:
3067 ; AVX2-FCP: # %bb.0:
3068 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3
3069 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm4
3070 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0
3071 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1
3072 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
3073 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm5
3074 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
3075 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
3076 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
3077 ; AVX2-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
3078 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
3079 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm6
3080 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
3081 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255]
3082 ; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1]
3083 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
3084 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
3085 ; AVX2-FCP-NEXT: vpmovsxwd {{.*#+}} ymm8 = [4294967295,4294967295,4294967295,255,0,0,0,0]
3086 ; AVX2-FCP-NEXT: vmovdqa %xmm8, %xmm7
3087 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm6
3088 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm5
3089 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
3090 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
3091 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
3092 ; AVX2-FCP-NEXT: vpor %xmm5, %xmm9, %xmm5
3093 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
3094 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm10
3095 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
3096 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0]
3097 ; AVX2-FCP-NEXT: # ymm12 = mem[0,1,0,1]
3098 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
3099 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
3100 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm10, %ymm5
3101 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm9
3102 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
3103 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u]
3104 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u]
3105 ; AVX2-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9
3106 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
3107 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm11
3108 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
3109 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255]
3110 ; AVX2-FCP-NEXT: # ymm13 = mem[0,1,0,1]
3111 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
3112 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
3113 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm11, %ymm7
3114 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
3115 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm11
3116 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
3117 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0]
3118 ; AVX2-FCP-NEXT: # ymm13 = mem[0,1,0,1]
3119 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
3120 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm10
3121 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u]
3122 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10
3123 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u]
3124 ; AVX2-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10
3125 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
3126 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm10, %ymm11, %ymm10
3127 ; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm8
3128 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[1,6,11]
3129 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm9
3130 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm3
3131 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
3132 ; AVX2-FCP-NEXT: vpor %xmm4, %xmm11, %xmm4
3133 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3134 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7],ymm6[8,9,10,11,12],ymm4[13,14,15]
3135 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
3136 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
3137 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3138 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255]
3139 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1]
3140 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
3141 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[4,9,14]
3142 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
3143 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1
3144 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3145 ; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,18446744073709551615,255]
3146 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm10, %ymm1, %ymm1
3147 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm6
3148 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[1,6,11,u,u,u,u]
3149 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12],zero,zero,zero,xmm9[u,u,u,u]
3150 ; AVX2-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6
3151 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm9
3152 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
3153 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7]
3154 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm9[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
3155 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [0,5,0,5,0,5,0,5]
3156 ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm9, %ymm6
3157 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm6, %ymm0
3158 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[2,7,12]
3159 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
3160 ; AVX2-FCP-NEXT: vpor %xmm2, %xmm6, %xmm2
3161 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
3162 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7],ymm5[8,9,10,11,12],ymm2[13,14,15]
3163 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
3164 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[3,8,13]
3165 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
3166 ; AVX2-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
3167 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
3168 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7],ymm7[8,9,10,11,12],ymm3[13,14,15]
3169 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7]
3170 ; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rsi)
3171 ; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rdx)
3172 ; AVX2-FCP-NEXT: vmovdqa %ymm3, (%rcx)
3173 ; AVX2-FCP-NEXT: vmovdqa %ymm1, (%r8)
3174 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%r9)
3175 ; AVX2-FCP-NEXT: vzeroupper
3176 ; AVX2-FCP-NEXT: retq
3178 ; AVX512-LABEL: load_i8_stride5_vf32:
3179 ; AVX512: # %bb.0:
3180 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
3181 ; AVX512-NEXT: vmovdqa (%rdi), %ymm3
3182 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5
3183 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm0
3184 ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm1
3185 ; AVX512-NEXT: vmovdqa %ymm2, %ymm4
3186 ; AVX512-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm4
3187 ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
3188 ; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm6
3189 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
3190 ; AVX512-NEXT: vpshufb %ymm8, %ymm6, %ymm6
3191 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
3192 ; AVX512-NEXT: vmovdqa %ymm4, %ymm7
3193 ; AVX512-NEXT: vpternlogq $202, %ymm5, %ymm3, %ymm7
3194 ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm9
3195 ; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u]
3196 ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u]
3197 ; AVX512-NEXT: vpor %xmm7, %xmm9, %xmm9
3198 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
3199 ; AVX512-NEXT: vpternlogq $236, %ymm11, %ymm6, %ymm9
3200 ; AVX512-NEXT: vmovdqa 144(%rdi), %xmm7
3201 ; AVX512-NEXT: vpshufb %xmm8, %xmm7, %xmm6
3202 ; AVX512-NEXT: vmovdqa 128(%rdi), %xmm8
3203 ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
3204 ; AVX512-NEXT: vpor %xmm6, %xmm10, %xmm6
3205 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
3206 ; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6,7],ymm9[8,9,10,11,12],ymm6[13,14,15]
3207 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
3208 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
3209 ; AVX512-NEXT: vmovdqa %ymm10, %ymm9
3210 ; AVX512-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm9
3211 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1]
3212 ; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm12
3213 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128]
3214 ; AVX512-NEXT: vpshufb %ymm9, %ymm12, %ymm12
3215 ; AVX512-NEXT: vmovdqa %ymm2, %ymm13
3216 ; AVX512-NEXT: vpternlogq $202, %ymm5, %ymm3, %ymm13
3217 ; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u]
3218 ; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm13
3219 ; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u]
3220 ; AVX512-NEXT: vpor %xmm14, %xmm13, %xmm13
3221 ; AVX512-NEXT: vpternlogq $236, %ymm11, %ymm12, %ymm13
3222 ; AVX512-NEXT: vpshufb %xmm9, %xmm7, %xmm9
3223 ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
3224 ; AVX512-NEXT: vpor %xmm9, %xmm12, %xmm9
3225 ; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
3226 ; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5,6,7],ymm13[8,9,10,11,12],ymm9[13,14,15]
3227 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
3228 ; AVX512-NEXT: vmovdqa %ymm4, %ymm12
3229 ; AVX512-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm12
3230 ; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
3231 ; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm13
3232 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128]
3233 ; AVX512-NEXT: vpshufb %ymm12, %ymm13, %ymm13
3234 ; AVX512-NEXT: vpternlogq $202, %ymm3, %ymm5, %ymm10
3235 ; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm14
3236 ; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14,u,u,u]
3237 ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero,xmm10[u,u,u]
3238 ; AVX512-NEXT: vpor %xmm14, %xmm10, %xmm10
3239 ; AVX512-NEXT: vpternlogq $236, %ymm11, %ymm13, %ymm10
3240 ; AVX512-NEXT: vpshufb %xmm12, %xmm7, %xmm11
3241 ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
3242 ; AVX512-NEXT: vpor %xmm11, %xmm12, %xmm11
3243 ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
3244 ; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15]
3245 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
3246 ; AVX512-NEXT: vmovdqa %ymm2, %ymm11
3247 ; AVX512-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm11
3248 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
3249 ; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm12
3250 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
3251 ; AVX512-NEXT: vpshufb %ymm11, %ymm12, %ymm12
3252 ; AVX512-NEXT: vmovdqa %ymm4, %ymm13
3253 ; AVX512-NEXT: vpternlogq $202, %ymm3, %ymm5, %ymm13
3254 ; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[u,u,u]
3255 ; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm13
3256 ; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[2,7,12],zero,zero,zero,xmm13[0,5,10,15,u,u,u]
3257 ; AVX512-NEXT: vpor %xmm14, %xmm13, %xmm13
3258 ; AVX512-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm13
3259 ; AVX512-NEXT: vpshufb %xmm11, %xmm7, %xmm7
3260 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
3261 ; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7
3262 ; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
3263 ; AVX512-NEXT: vpmovsxwq {{.*#+}} ymm8 = [18446744073709551615,18446744073709551615,18446744073709551615,255]
3264 ; AVX512-NEXT: vpternlogq $184, %ymm13, %ymm8, %ymm7
3265 ; AVX512-NEXT: vpternlogq $202, %ymm3, %ymm5, %ymm2
3266 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
3267 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
3268 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
3269 ; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
3270 ; AVX512-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm4
3271 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,3,0,1]
3272 ; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm0
3273 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
3274 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
3275 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm1
3276 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
3277 ; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5]
3278 ; AVX512-NEXT: vpermd %ymm1, %ymm2, %ymm1
3279 ; AVX512-NEXT: vpternlogq $184, %ymm0, %ymm8, %ymm1
3280 ; AVX512-NEXT: vmovdqa %ymm6, (%rsi)
3281 ; AVX512-NEXT: vmovdqa %ymm9, (%rdx)
3282 ; AVX512-NEXT: vmovdqa %ymm10, (%rcx)
3283 ; AVX512-NEXT: vmovdqa %ymm7, (%r8)
3284 ; AVX512-NEXT: vmovdqa %ymm1, (%r9)
3285 ; AVX512-NEXT: vzeroupper
3286 ; AVX512-NEXT: retq
3288 ; AVX512-FCP-LABEL: load_i8_stride5_vf32:
3289 ; AVX512-FCP: # %bb.0:
3290 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
3291 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3
3292 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
3293 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm0
3294 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm1
3295 ; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm4
3296 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm4
3297 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
3298 ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm6
3299 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
3300 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm6
3301 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
3302 ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm7
3303 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm3, %ymm7
3304 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9
3305 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u]
3306 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u]
3307 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm9, %xmm9
3308 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
3309 ; AVX512-FCP-NEXT: vpternlogq $236, %ymm11, %ymm6, %ymm9
3310 ; AVX512-FCP-NEXT: vmovdqa 144(%rdi), %xmm7
3311 ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm6
3312 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm8
3313 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
3314 ; AVX512-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6
3315 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
3316 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6,7],ymm9[8,9,10,11,12],ymm6[13,14,15]
3317 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
3318 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
3319 ; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm9
3320 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm9
3321 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1]
3322 ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm12
3323 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128]
3324 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm12
3325 ; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm13
3326 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm3, %ymm13
3327 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u]
3328 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13
3329 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u]
3330 ; AVX512-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13
3331 ; AVX512-FCP-NEXT: vpternlogq $236, %ymm11, %ymm12, %ymm13
3332 ; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm9
3333 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
3334 ; AVX512-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9
3335 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
3336 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5,6,7],ymm13[8,9,10,11,12],ymm9[13,14,15]
3337 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
3338 ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm12
3339 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm12
3340 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
3341 ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm13
3342 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128]
3343 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm13
3344 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm3, %ymm5, %ymm10
3345 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14
3346 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14,u,u,u]
3347 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero,xmm10[u,u,u]
3348 ; AVX512-FCP-NEXT: vpor %xmm14, %xmm10, %xmm10
3349 ; AVX512-FCP-NEXT: vpternlogq $236, %ymm11, %ymm13, %ymm10
3350 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm11
3351 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
3352 ; AVX512-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11
3353 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
3354 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15]
3355 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
3356 ; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm11
3357 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm11
3358 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
3359 ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm12
3360 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
3361 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm12
3362 ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm13
3363 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm3, %ymm5, %ymm13
3364 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[u,u,u]
3365 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13
3366 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[2,7,12],zero,zero,zero,xmm13[0,5,10,15,u,u,u]
3367 ; AVX512-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13
3368 ; AVX512-FCP-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm13
3369 ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm7
3370 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
3371 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
3372 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
3373 ; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm8 = [18446744073709551615,18446744073709551615,18446744073709551615,255]
3374 ; AVX512-FCP-NEXT: vpternlogq $184, %ymm13, %ymm8, %ymm7
3375 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm3, %ymm5, %ymm2
3376 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
3377 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
3378 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
3379 ; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
3380 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm4
3381 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,3,0,1]
3382 ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm0
3383 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
3384 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
3385 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
3386 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
3387 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5]
3388 ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1
3389 ; AVX512-FCP-NEXT: vpternlogq $184, %ymm0, %ymm8, %ymm1
3390 ; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rsi)
3391 ; AVX512-FCP-NEXT: vmovdqa %ymm9, (%rdx)
3392 ; AVX512-FCP-NEXT: vmovdqa %ymm10, (%rcx)
3393 ; AVX512-FCP-NEXT: vmovdqa %ymm7, (%r8)
3394 ; AVX512-FCP-NEXT: vmovdqa %ymm1, (%r9)
3395 ; AVX512-FCP-NEXT: vzeroupper
3396 ; AVX512-FCP-NEXT: retq
3398 ; AVX512DQ-LABEL: load_i8_stride5_vf32:
3399 ; AVX512DQ: # %bb.0:
3400 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
3401 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3
3402 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5
3403 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm0
3404 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm1
3405 ; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm4
3406 ; AVX512DQ-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm4
3407 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
3408 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm6
3409 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
3410 ; AVX512DQ-NEXT: vpshufb %ymm8, %ymm6, %ymm6
3411 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
3412 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm7
3413 ; AVX512DQ-NEXT: vpternlogq $202, %ymm5, %ymm3, %ymm7
3414 ; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm9
3415 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u]
3416 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u]
3417 ; AVX512DQ-NEXT: vpor %xmm7, %xmm9, %xmm9
3418 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
3419 ; AVX512DQ-NEXT: vpternlogq $236, %ymm11, %ymm6, %ymm9
3420 ; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm7
3421 ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm7, %xmm6
3422 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm8
3423 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
3424 ; AVX512DQ-NEXT: vpor %xmm6, %xmm10, %xmm6
3425 ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
3426 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6,7],ymm9[8,9,10,11,12],ymm6[13,14,15]
3427 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
3428 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
3429 ; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm9
3430 ; AVX512DQ-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm9
3431 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1]
3432 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm12
3433 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128]
3434 ; AVX512DQ-NEXT: vpshufb %ymm9, %ymm12, %ymm12
3435 ; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm13
3436 ; AVX512DQ-NEXT: vpternlogq $202, %ymm5, %ymm3, %ymm13
3437 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u]
3438 ; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm13
3439 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u]
3440 ; AVX512DQ-NEXT: vpor %xmm14, %xmm13, %xmm13
3441 ; AVX512DQ-NEXT: vpternlogq $236, %ymm11, %ymm12, %ymm13
3442 ; AVX512DQ-NEXT: vpshufb %xmm9, %xmm7, %xmm9
3443 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
3444 ; AVX512DQ-NEXT: vpor %xmm9, %xmm12, %xmm9
3445 ; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
3446 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5,6,7],ymm13[8,9,10,11,12],ymm9[13,14,15]
3447 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
3448 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm12
3449 ; AVX512DQ-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm12
3450 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
3451 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm13
3452 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128]
3453 ; AVX512DQ-NEXT: vpshufb %ymm12, %ymm13, %ymm13
3454 ; AVX512DQ-NEXT: vpternlogq $202, %ymm3, %ymm5, %ymm10
3455 ; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm14
3456 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14,u,u,u]
3457 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero,xmm10[u,u,u]
3458 ; AVX512DQ-NEXT: vpor %xmm14, %xmm10, %xmm10
3459 ; AVX512DQ-NEXT: vpternlogq $236, %ymm11, %ymm13, %ymm10
3460 ; AVX512DQ-NEXT: vpshufb %xmm12, %xmm7, %xmm11
3461 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
3462 ; AVX512DQ-NEXT: vpor %xmm11, %xmm12, %xmm11
3463 ; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
3464 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15]
3465 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
3466 ; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm11
3467 ; AVX512DQ-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm11
3468 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
3469 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm12
3470 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
3471 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm12, %ymm12
3472 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm13
3473 ; AVX512DQ-NEXT: vpternlogq $202, %ymm3, %ymm5, %ymm13
3474 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[u,u,u]
3475 ; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm13
3476 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[2,7,12],zero,zero,zero,xmm13[0,5,10,15,u,u,u]
3477 ; AVX512DQ-NEXT: vpor %xmm14, %xmm13, %xmm13
3478 ; AVX512DQ-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm13
3479 ; AVX512DQ-NEXT: vpshufb %xmm11, %xmm7, %xmm7
3480 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
3481 ; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm7
3482 ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
3483 ; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} ymm8 = [18446744073709551615,18446744073709551615,18446744073709551615,255]
3484 ; AVX512DQ-NEXT: vpternlogq $184, %ymm13, %ymm8, %ymm7
3485 ; AVX512DQ-NEXT: vpternlogq $202, %ymm3, %ymm5, %ymm2
3486 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
3487 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
3488 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
3489 ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
3490 ; AVX512DQ-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm4
3491 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,3,0,1]
3492 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm0
3493 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
3494 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
3495 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm1
3496 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
3497 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5]
3498 ; AVX512DQ-NEXT: vpermd %ymm1, %ymm2, %ymm1
3499 ; AVX512DQ-NEXT: vpternlogq $184, %ymm0, %ymm8, %ymm1
3500 ; AVX512DQ-NEXT: vmovdqa %ymm6, (%rsi)
3501 ; AVX512DQ-NEXT: vmovdqa %ymm9, (%rdx)
3502 ; AVX512DQ-NEXT: vmovdqa %ymm10, (%rcx)
3503 ; AVX512DQ-NEXT: vmovdqa %ymm7, (%r8)
3504 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%r9)
3505 ; AVX512DQ-NEXT: vzeroupper
3506 ; AVX512DQ-NEXT: retq
3508 ; AVX512DQ-FCP-LABEL: load_i8_stride5_vf32:
3509 ; AVX512DQ-FCP: # %bb.0:
3510 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
3511 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3
3512 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
3513 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm0
3514 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm1
3515 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm4
3516 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm4
3517 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
3518 ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm6
3519 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
3520 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm6
3521 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
3522 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm7
3523 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm3, %ymm7
3524 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9
3525 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u]
3526 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u]
3527 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm9, %xmm9
3528 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
3529 ; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm11, %ymm6, %ymm9
3530 ; AVX512DQ-FCP-NEXT: vmovdqa 144(%rdi), %xmm7
3531 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm6
3532 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm8
3533 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
3534 ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6
3535 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
3536 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6,7],ymm9[8,9,10,11,12],ymm6[13,14,15]
3537 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
3538 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
3539 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm9
3540 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm9
3541 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm9[2,3,0,1]
3542 ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm12
3543 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128]
3544 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm12
3545 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm13
3546 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm3, %ymm13
3547 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u]
3548 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13
3549 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u]
3550 ; AVX512DQ-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13
3551 ; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm11, %ymm12, %ymm13
3552 ; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm9
3553 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
3554 ; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9
3555 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
3556 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5,6,7],ymm13[8,9,10,11,12],ymm9[13,14,15]
3557 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
3558 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm12
3559 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm12
3560 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
3561 ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm13
3562 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128]
3563 ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm13
3564 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm3, %ymm5, %ymm10
3565 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14
3566 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14,u,u,u]
3567 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero,xmm10[u,u,u]
3568 ; AVX512DQ-FCP-NEXT: vpor %xmm14, %xmm10, %xmm10
3569 ; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm11, %ymm13, %ymm10
3570 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm11
3571 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
3572 ; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11
3573 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
3574 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15]
3575 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
3576 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm11
3577 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm11
3578 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
3579 ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm12
3580 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
3581 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm12
3582 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm13
3583 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm3, %ymm5, %ymm13
3584 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[u,u,u]
3585 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13
3586 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[2,7,12],zero,zero,zero,xmm13[0,5,10,15,u,u,u]
3587 ; AVX512DQ-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13
3588 ; AVX512DQ-FCP-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm13
3589 ; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm7
3590 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
3591 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
3592 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
3593 ; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm8 = [18446744073709551615,18446744073709551615,18446744073709551615,255]
3594 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm13, %ymm8, %ymm7
3595 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm3, %ymm5, %ymm2
3596 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
3597 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
3598 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
3599 ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
3600 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm4
3601 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,3,0,1]
3602 ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm0
3603 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
3604 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
3605 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
3606 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
3607 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5]
3608 ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1
3609 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm0, %ymm8, %ymm1
3610 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rsi)
3611 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, (%rdx)
3612 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, (%rcx)
3613 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%r8)
3614 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%r9)
3615 ; AVX512DQ-FCP-NEXT: vzeroupper
3616 ; AVX512DQ-FCP-NEXT: retq
3618 ; AVX512BW-LABEL: load_i8_stride5_vf32:
3619 ; AVX512BW: # %bb.0:
3620 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm3
3621 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
3622 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm0
3623 ; AVX512BW-NEXT: vmovdqa 96(%rdi), %ymm1
3624 ; AVX512BW-NEXT: movw $21140, %ax # imm = 0x5294
3625 ; AVX512BW-NEXT: kmovd %eax, %k1
3626 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k1}
3627 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
3628 ; AVX512BW-NEXT: movl $1108344832, %eax # imm = 0x42100000
3629 ; AVX512BW-NEXT: kmovd %eax, %k2
3630 ; AVX512BW-NEXT: vmovdqu8 %ymm5, %ymm4 {%k2}
3631 ; AVX512BW-NEXT: movw $19026, %ax # imm = 0x4A52
3632 ; AVX512BW-NEXT: kmovd %eax, %k2
3633 ; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k2}
3634 ; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6
3635 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
3636 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
3637 ; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5
3638 ; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000
3639 ; AVX512BW-NEXT: kmovd %eax, %k3
3640 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
3641 ; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm6
3642 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11]
3643 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm7
3644 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
3645 ; AVX512BW-NEXT: vpor %xmm4, %xmm8, %xmm4
3646 ; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3647 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7],ymm5[8,9,10,11,12],ymm4[13,14,15]
3648 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
3649 ; AVX512BW-NEXT: movw $10570, %ax # imm = 0x294A
3650 ; AVX512BW-NEXT: kmovd %eax, %k4
3651 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm5 {%k4}
3652 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1]
3653 ; AVX512BW-NEXT: movl $-2078212096, %eax # imm = 0x84210000
3654 ; AVX512BW-NEXT: kmovd %eax, %k5
3655 ; AVX512BW-NEXT: vmovdqu8 %ymm8, %ymm5 {%k5}
3656 ; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm8 {%k1}
3657 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[1,6,11],zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[u,u,u]
3658 ; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm8
3659 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13,u,u,u]
3660 ; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8
3661 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 {%k3} = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
3662 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[2,7,12]
3663 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
3664 ; AVX512BW-NEXT: vpor %xmm5, %xmm9, %xmm5
3665 ; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
3666 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7],ymm8[8,9,10,11,12],ymm5[13,14,15]
3667 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
3668 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm8 {%k2}
3669 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
3670 ; AVX512BW-NEXT: movl $138543104, %eax # imm = 0x8420000
3671 ; AVX512BW-NEXT: kmovd %eax, %k5
3672 ; AVX512BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k5}
3673 ; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k4}
3674 ; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm10
3675 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u]
3676 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u]
3677 ; AVX512BW-NEXT: vpor %xmm10, %xmm9, %xmm9
3678 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 {%k3} = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
3679 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[3,8,13]
3680 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
3681 ; AVX512BW-NEXT: vpor %xmm8, %xmm10, %xmm8
3682 ; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
3683 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15]
3684 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
3685 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm9 {%k1}
3686 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
3687 ; AVX512BW-NEXT: movl $277086208, %eax # imm = 0x10840000
3688 ; AVX512BW-NEXT: kmovd %eax, %k3
3689 ; AVX512BW-NEXT: vmovdqu8 %ymm10, %ymm9 {%k3}
3690 ; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm10 {%k2}
3691 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u]
3692 ; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm10
3693 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u]
3694 ; AVX512BW-NEXT: vpor %xmm11, %xmm10, %xmm10
3695 ; AVX512BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000
3696 ; AVX512BW-NEXT: kmovd %eax, %k3
3697 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 {%k3} = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
3698 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14]
3699 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
3700 ; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6
3701 ; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
3702 ; AVX512BW-NEXT: movl $-33554432, %eax # imm = 0xFE000000
3703 ; AVX512BW-NEXT: kmovd %eax, %k3
3704 ; AVX512BW-NEXT: vmovdqu8 %ymm6, %ymm10 {%k3}
3705 ; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k1}
3706 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
3707 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
3708 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
3709 ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
3710 ; AVX512BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k2}
3711 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3712 ; AVX512BW-NEXT: movl $554172416, %eax # imm = 0x21080000
3713 ; AVX512BW-NEXT: kmovd %eax, %k1
3714 ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
3715 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
3716 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
3717 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm1
3718 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
3719 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5]
3720 ; AVX512BW-NEXT: vpermd %ymm1, %ymm2, %ymm1
3721 ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k3}
3722 ; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi)
3723 ; AVX512BW-NEXT: vmovdqa %ymm5, (%rdx)
3724 ; AVX512BW-NEXT: vmovdqa %ymm8, (%rcx)
3725 ; AVX512BW-NEXT: vmovdqa %ymm10, (%r8)
3726 ; AVX512BW-NEXT: vmovdqa %ymm0, (%r9)
3727 ; AVX512BW-NEXT: vzeroupper
3728 ; AVX512BW-NEXT: retq
3730 ; AVX512BW-FCP-LABEL: load_i8_stride5_vf32:
3731 ; AVX512BW-FCP: # %bb.0:
3732 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm3
3733 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
3734 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm0
3735 ; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm1
3736 ; AVX512BW-FCP-NEXT: movw $21140, %ax # imm = 0x5294
3737 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
3738 ; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k1}
3739 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
3740 ; AVX512BW-FCP-NEXT: movl $1108344832, %eax # imm = 0x42100000
3741 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2
3742 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm4 {%k2}
3743 ; AVX512BW-FCP-NEXT: movw $19026, %ax # imm = 0x4A52
3744 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2
3745 ; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k2}
3746 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
3747 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
3748 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
3749 ; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
3750 ; AVX512BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000
3751 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3
3752 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
3753 ; AVX512BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm6
3754 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11]
3755 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm7
3756 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
3757 ; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm8, %xmm4
3758 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3759 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7],ymm5[8,9,10,11,12],ymm4[13,14,15]
3760 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
3761 ; AVX512BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A
3762 ; AVX512BW-FCP-NEXT: kmovd %eax, %k4
3763 ; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm5 {%k4}
3764 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1]
3765 ; AVX512BW-FCP-NEXT: movl $-2078212096, %eax # imm = 0x84210000
3766 ; AVX512BW-FCP-NEXT: kmovd %eax, %k5
3767 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm8, %ymm5 {%k5}
3768 ; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm8 {%k1}
3769 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[1,6,11],zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[u,u,u]
3770 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
3771 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13,u,u,u]
3772 ; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
3773 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 {%k3} = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
3774 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[2,7,12]
3775 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
3776 ; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm9, %xmm5
3777 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
3778 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7],ymm8[8,9,10,11,12],ymm5[13,14,15]
3779 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
3780 ; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm8 {%k2}
3781 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
3782 ; AVX512BW-FCP-NEXT: movl $138543104, %eax # imm = 0x8420000
3783 ; AVX512BW-FCP-NEXT: kmovd %eax, %k5
3784 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm8 {%k5}
3785 ; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k4}
3786 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
3787 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u]
3788 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u]
3789 ; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9
3790 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k3} = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
3791 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[3,8,13]
3792 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
3793 ; AVX512BW-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8
3794 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
3795 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15]
3796 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
3797 ; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm9 {%k1}
3798 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
3799 ; AVX512BW-FCP-NEXT: movl $277086208, %eax # imm = 0x10840000
3800 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3
3801 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm10, %ymm9 {%k3}
3802 ; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm10 {%k2}
3803 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u]
3804 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10
3805 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u]
3806 ; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10
3807 ; AVX512BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000
3808 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3
3809 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k3} = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
3810 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14]
3811 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
3812 ; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
3813 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
3814 ; AVX512BW-FCP-NEXT: movl $-33554432, %eax # imm = 0xFE000000
3815 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3
3816 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm10 {%k3}
3817 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k1}
3818 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
3819 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
3820 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
3821 ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
3822 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k2}
3823 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3824 ; AVX512BW-FCP-NEXT: movl $554172416, %eax # imm = 0x21080000
3825 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
3826 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
3827 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
3828 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
3829 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
3830 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
3831 ; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5]
3832 ; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1
3833 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k3}
3834 ; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rsi)
3835 ; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rdx)
3836 ; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%rcx)
3837 ; AVX512BW-FCP-NEXT: vmovdqa %ymm10, (%r8)
3838 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%r9)
3839 ; AVX512BW-FCP-NEXT: vzeroupper
3840 ; AVX512BW-FCP-NEXT: retq
3842 ; AVX512DQ-BW-LABEL: load_i8_stride5_vf32:
3843 ; AVX512DQ-BW: # %bb.0:
3844 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm3
3845 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2
3846 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm0
3847 ; AVX512DQ-BW-NEXT: vmovdqa 96(%rdi), %ymm1
3848 ; AVX512DQ-BW-NEXT: movw $21140, %ax # imm = 0x5294
3849 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
3850 ; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k1}
3851 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
3852 ; AVX512DQ-BW-NEXT: movl $1108344832, %eax # imm = 0x42100000
3853 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2
3854 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm5, %ymm4 {%k2}
3855 ; AVX512DQ-BW-NEXT: movw $19026, %ax # imm = 0x4A52
3856 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2
3857 ; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k2}
3858 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm6
3859 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
3860 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
3861 ; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5
3862 ; AVX512DQ-BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000
3863 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3
3864 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
3865 ; AVX512DQ-BW-NEXT: vmovdqa 144(%rdi), %xmm6
3866 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11]
3867 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm7
3868 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
3869 ; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm8, %xmm4
3870 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3871 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7],ymm5[8,9,10,11,12],ymm4[13,14,15]
3872 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
3873 ; AVX512DQ-BW-NEXT: movw $10570, %ax # imm = 0x294A
3874 ; AVX512DQ-BW-NEXT: kmovd %eax, %k4
3875 ; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm5 {%k4}
3876 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1]
3877 ; AVX512DQ-BW-NEXT: movl $-2078212096, %eax # imm = 0x84210000
3878 ; AVX512DQ-BW-NEXT: kmovd %eax, %k5
3879 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm8, %ymm5 {%k5}
3880 ; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm8 {%k1}
3881 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[1,6,11],zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[u,u,u]
3882 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm8, %xmm8
3883 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13,u,u,u]
3884 ; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm8, %xmm8
3885 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 {%k3} = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
3886 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[2,7,12]
3887 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
3888 ; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm9, %xmm5
3889 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
3890 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7],ymm8[8,9,10,11,12],ymm5[13,14,15]
3891 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
3892 ; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm8 {%k2}
3893 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
3894 ; AVX512DQ-BW-NEXT: movl $138543104, %eax # imm = 0x8420000
3895 ; AVX512DQ-BW-NEXT: kmovd %eax, %k5
3896 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k5}
3897 ; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k4}
3898 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm9, %xmm10
3899 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u]
3900 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u]
3901 ; AVX512DQ-BW-NEXT: vpor %xmm10, %xmm9, %xmm9
3902 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 {%k3} = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
3903 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[3,8,13]
3904 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
3905 ; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm10, %xmm8
3906 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
3907 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15]
3908 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
3909 ; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm9 {%k1}
3910 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
3911 ; AVX512DQ-BW-NEXT: movl $277086208, %eax # imm = 0x10840000
3912 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3
3913 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm10, %ymm9 {%k3}
3914 ; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm10 {%k2}
3915 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u]
3916 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm10, %xmm10
3917 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u]
3918 ; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm10, %xmm10
3919 ; AVX512DQ-BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000
3920 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3
3921 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm10 {%k3} = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
3922 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14]
3923 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
3924 ; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm7, %xmm6
3925 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
3926 ; AVX512DQ-BW-NEXT: movl $-33554432, %eax # imm = 0xFE000000
3927 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3
3928 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm6, %ymm10 {%k3}
3929 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k1}
3930 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
3931 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
3932 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
3933 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
3934 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k2}
3935 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3936 ; AVX512DQ-BW-NEXT: movl $554172416, %eax # imm = 0x21080000
3937 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
3938 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
3939 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
3940 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
3941 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm1
3942 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
3943 ; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5]
3944 ; AVX512DQ-BW-NEXT: vpermd %ymm1, %ymm2, %ymm1
3945 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k3}
3946 ; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rsi)
3947 ; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%rdx)
3948 ; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%rcx)
3949 ; AVX512DQ-BW-NEXT: vmovdqa %ymm10, (%r8)
3950 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%r9)
3951 ; AVX512DQ-BW-NEXT: vzeroupper
3952 ; AVX512DQ-BW-NEXT: retq
3954 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride5_vf32:
3955 ; AVX512DQ-BW-FCP: # %bb.0:
3956 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm3
3957 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
3958 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm0
3959 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm1
3960 ; AVX512DQ-BW-FCP-NEXT: movw $21140, %ax # imm = 0x5294
3961 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
3962 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k1}
3963 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
3964 ; AVX512DQ-BW-FCP-NEXT: movl $1108344832, %eax # imm = 0x42100000
3965 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2
3966 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm4 {%k2}
3967 ; AVX512DQ-BW-FCP-NEXT: movw $19026, %ax # imm = 0x4A52
3968 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2
3969 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k2}
3970 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
3971 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
3972 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
3973 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
3974 ; AVX512DQ-BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000
3975 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3
3976 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
3977 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm6
3978 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11]
3979 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm7
3980 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
3981 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm8, %xmm4
3982 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3983 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7],ymm5[8,9,10,11,12],ymm4[13,14,15]
3984 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
3985 ; AVX512DQ-BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A
3986 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4
3987 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm5 {%k4}
3988 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1]
3989 ; AVX512DQ-BW-FCP-NEXT: movl $-2078212096, %eax # imm = 0x84210000
3990 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5
3991 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm8, %ymm5 {%k5}
3992 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm8 {%k1}
3993 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[1,6,11],zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[u,u,u]
3994 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
3995 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13,u,u,u]
3996 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
3997 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 {%k3} = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
3998 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[2,7,12]
3999 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
4000 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm9, %xmm5
4001 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
4002 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7],ymm8[8,9,10,11,12],ymm5[13,14,15]
4003 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
4004 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm8 {%k2}
4005 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
4006 ; AVX512DQ-BW-FCP-NEXT: movl $138543104, %eax # imm = 0x8420000
4007 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5
4008 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm8 {%k5}
4009 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k4}
4010 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
4011 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u]
4012 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u]
4013 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9
4014 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k3} = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
4015 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[3,8,13]
4016 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
4017 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8
4018 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
4019 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15]
4020 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
4021 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm9 {%k1}
4022 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
4023 ; AVX512DQ-BW-FCP-NEXT: movl $277086208, %eax # imm = 0x10840000
4024 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3
4025 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm10, %ymm9 {%k3}
4026 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm10 {%k2}
4027 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u]
4028 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10
4029 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u]
4030 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10
4031 ; AVX512DQ-BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000
4032 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3
4033 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k3} = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
4034 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14]
4035 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
4036 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
4037 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
4038 ; AVX512DQ-BW-FCP-NEXT: movl $-33554432, %eax # imm = 0xFE000000
4039 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3
4040 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm10 {%k3}
4041 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k1}
4042 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
4043 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
4044 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
4045 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
4046 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k2}
4047 ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
4048 ; AVX512DQ-BW-FCP-NEXT: movl $554172416, %eax # imm = 0x21080000
4049 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
4050 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
4051 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
4052 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
4053 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
4054 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
4055 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5]
4056 ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1
4057 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k3}
4058 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rsi)
4059 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rdx)
4060 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%rcx)
4061 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm10, (%r8)
4062 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%r9)
4063 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
4064 ; AVX512DQ-BW-FCP-NEXT: retq
4065 %wide.vec = load <160 x i8>, ptr %in.vec, align 64
4066 %strided.vec0 = shufflevector <160 x i8> %wide.vec, <160 x i8> poison, <32 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75, i32 80, i32 85, i32 90, i32 95, i32 100, i32 105, i32 110, i32 115, i32 120, i32 125, i32 130, i32 135, i32 140, i32 145, i32 150, i32 155>
4067 %strided.vec1 = shufflevector <160 x i8> %wide.vec, <160 x i8> poison, <32 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36, i32 41, i32 46, i32 51, i32 56, i32 61, i32 66, i32 71, i32 76, i32 81, i32 86, i32 91, i32 96, i32 101, i32 106, i32 111, i32 116, i32 121, i32 126, i32 131, i32 136, i32 141, i32 146, i32 151, i32 156>
4068 %strided.vec2 = shufflevector <160 x i8> %wide.vec, <160 x i8> poison, <32 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37, i32 42, i32 47, i32 52, i32 57, i32 62, i32 67, i32 72, i32 77, i32 82, i32 87, i32 92, i32 97, i32 102, i32 107, i32 112, i32 117, i32 122, i32 127, i32 132, i32 137, i32 142, i32 147, i32 152, i32 157>
4069 %strided.vec3 = shufflevector <160 x i8> %wide.vec, <160 x i8> poison, <32 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38, i32 43, i32 48, i32 53, i32 58, i32 63, i32 68, i32 73, i32 78, i32 83, i32 88, i32 93, i32 98, i32 103, i32 108, i32 113, i32 118, i32 123, i32 128, i32 133, i32 138, i32 143, i32 148, i32 153, i32 158>
4070 %strided.vec4 = shufflevector <160 x i8> %wide.vec, <160 x i8> poison, <32 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39, i32 44, i32 49, i32 54, i32 59, i32 64, i32 69, i32 74, i32 79, i32 84, i32 89, i32 94, i32 99, i32 104, i32 109, i32 114, i32 119, i32 124, i32 129, i32 134, i32 139, i32 144, i32 149, i32 154, i32 159>
4071 store <32 x i8> %strided.vec0, ptr %out.vec0, align 64
4072 store <32 x i8> %strided.vec1, ptr %out.vec1, align 64
4073 store <32 x i8> %strided.vec2, ptr %out.vec2, align 64
4074 store <32 x i8> %strided.vec3, ptr %out.vec3, align 64
4075 store <32 x i8> %strided.vec4, ptr %out.vec4, align 64
4076 ret void
4077 }
4079 define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
4080 ; SSE-LABEL: load_i8_stride5_vf64:
4081 ; SSE: # %bb.0:
4082 ; SSE-NEXT: subq $552, %rsp # imm = 0x228
4083 ; SSE-NEXT: movdqa 160(%rdi), %xmm9
4084 ; SSE-NEXT: movdqa 176(%rdi), %xmm3
4085 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4086 ; SSE-NEXT: movdqa 208(%rdi), %xmm4
4087 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4088 ; SSE-NEXT: movdqa 192(%rdi), %xmm1
4089 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4090 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
4091 ; SSE-NEXT: movdqa %xmm2, %xmm0
4092 ; SSE-NEXT: pandn %xmm1, %xmm0
4093 ; SSE-NEXT: movdqa %xmm4, %xmm1
4094 ; SSE-NEXT: pand %xmm2, %xmm1
4095 ; SSE-NEXT: movdqa %xmm2, %xmm14
4096 ; SSE-NEXT: por %xmm0, %xmm1
4097 ; SSE-NEXT: pxor %xmm12, %xmm12
4098 ; SSE-NEXT: movdqa %xmm1, %xmm0
4099 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
4100 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
4101 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
4102 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15]
4103 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4104 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
4105 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
4106 ; SSE-NEXT: packuswb %xmm1, %xmm0
4107 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3]
4108 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255]
4109 ; SSE-NEXT: movdqa %xmm11, %xmm1
4110 ; SSE-NEXT: pandn %xmm0, %xmm1
4111 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
4112 ; SSE-NEXT: movdqa %xmm10, %xmm0
4113 ; SSE-NEXT: pandn %xmm3, %xmm0
4114 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
4115 ; SSE-NEXT: movdqa %xmm2, %xmm3
4116 ; SSE-NEXT: movdqa %xmm2, %xmm4
4117 ; SSE-NEXT: pandn %xmm9, %xmm3
4118 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4119 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
4120 ; SSE-NEXT: movdqa %xmm7, %xmm3
4121 ; SSE-NEXT: pandn %xmm9, %xmm3
4122 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4123 ; SSE-NEXT: movdqa %xmm14, %xmm2
4124 ; SSE-NEXT: pandn %xmm9, %xmm2
4125 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4126 ; SSE-NEXT: movdqa %xmm10, %xmm2
4127 ; SSE-NEXT: pandn %xmm9, %xmm2
4128 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4129 ; SSE-NEXT: pand %xmm10, %xmm9
4130 ; SSE-NEXT: por %xmm0, %xmm9
4131 ; SSE-NEXT: movdqa %xmm9, %xmm0
4132 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
4133 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,0,0,65535,65535]
4134 ; SSE-NEXT: movdqa %xmm8, %xmm2
4135 ; SSE-NEXT: pandn %xmm0, %xmm2
4136 ; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15]
4137 ; SSE-NEXT: pand %xmm8, %xmm9
4138 ; SSE-NEXT: por %xmm2, %xmm9
4139 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,1,3,4,5,6,7]
4140 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
4141 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
4142 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
4143 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
4144 ; SSE-NEXT: packuswb %xmm0, %xmm0
4145 ; SSE-NEXT: pand %xmm11, %xmm0
4146 ; SSE-NEXT: por %xmm1, %xmm0
4147 ; SSE-NEXT: movdqa 224(%rdi), %xmm3
4148 ; SSE-NEXT: movdqa %xmm3, %xmm2
4149 ; SSE-NEXT: pxor %xmm1, %xmm1
4150 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
4151 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4152 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
4153 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4154 ; SSE-NEXT: pxor %xmm9, %xmm9
4155 ; SSE-NEXT: movdqa %xmm3, %xmm1
4156 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0]
4157 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3]
4158 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
4159 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
4160 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4]
4161 ; SSE-NEXT: packuswb %xmm1, %xmm1
4162 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
4163 ; SSE-NEXT: movdqa %xmm6, %xmm2
4164 ; SSE-NEXT: pandn %xmm1, %xmm2
4165 ; SSE-NEXT: pand %xmm6, %xmm0
4166 ; SSE-NEXT: por %xmm0, %xmm2
4167 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4168 ; SSE-NEXT: movdqa 32(%rdi), %xmm1
4169 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4170 ; SSE-NEXT: movdqa %xmm14, %xmm0
4171 ; SSE-NEXT: pandn %xmm1, %xmm0
4172 ; SSE-NEXT: movdqa 48(%rdi), %xmm15
4173 ; SSE-NEXT: movdqa %xmm15, %xmm1
4174 ; SSE-NEXT: pand %xmm14, %xmm1
4175 ; SSE-NEXT: por %xmm0, %xmm1
4176 ; SSE-NEXT: movdqa %xmm1, %xmm0
4177 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
4178 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
4179 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
4180 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
4181 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4182 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
4183 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
4184 ; SSE-NEXT: packuswb %xmm1, %xmm0
4185 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3]
4186 ; SSE-NEXT: movdqa %xmm11, %xmm1
4187 ; SSE-NEXT: pandn %xmm0, %xmm1
4188 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
4189 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4190 ; SSE-NEXT: movdqa %xmm10, %xmm2
4191 ; SSE-NEXT: pandn %xmm0, %xmm2
4192 ; SSE-NEXT: movdqa (%rdi), %xmm3
4193 ; SSE-NEXT: movdqa %xmm4, %xmm0
4194 ; SSE-NEXT: pandn %xmm3, %xmm4
4195 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4196 ; SSE-NEXT: movdqa %xmm7, %xmm4
4197 ; SSE-NEXT: pandn %xmm3, %xmm4
4198 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4199 ; SSE-NEXT: movdqa %xmm14, %xmm4
4200 ; SSE-NEXT: pandn %xmm3, %xmm4
4201 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4202 ; SSE-NEXT: movdqa %xmm10, %xmm4
4203 ; SSE-NEXT: pandn %xmm3, %xmm4
4204 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4205 ; SSE-NEXT: pand %xmm10, %xmm3
4206 ; SSE-NEXT: por %xmm2, %xmm3
4207 ; SSE-NEXT: movdqa %xmm3, %xmm2
4208 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
4209 ; SSE-NEXT: movdqa %xmm8, %xmm4
4210 ; SSE-NEXT: pandn %xmm2, %xmm4
4211 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
4212 ; SSE-NEXT: pand %xmm8, %xmm3
4213 ; SSE-NEXT: por %xmm4, %xmm3
4214 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,1,3,4,5,6,7]
4215 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
4216 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
4217 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7]
4218 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
4219 ; SSE-NEXT: packuswb %xmm2, %xmm2
4220 ; SSE-NEXT: pand %xmm11, %xmm2
4221 ; SSE-NEXT: por %xmm1, %xmm2
4222 ; SSE-NEXT: movdqa 64(%rdi), %xmm1
4223 ; SSE-NEXT: movdqa %xmm1, %xmm3
4224 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
4225 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4226 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
4227 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4228 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[0,0]
4229 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,3]
4230 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
4231 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
4232 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4]
4233 ; SSE-NEXT: packuswb %xmm1, %xmm1
4234 ; SSE-NEXT: movdqa %xmm6, %xmm3
4235 ; SSE-NEXT: pandn %xmm1, %xmm3
4236 ; SSE-NEXT: pand %xmm6, %xmm2
4237 ; SSE-NEXT: por %xmm2, %xmm3
4238 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4239 ; SSE-NEXT: movdqa 272(%rdi), %xmm2
4240 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4241 ; SSE-NEXT: movdqa %xmm14, %xmm1
4242 ; SSE-NEXT: pandn %xmm2, %xmm1
4243 ; SSE-NEXT: movdqa 288(%rdi), %xmm13
4244 ; SSE-NEXT: movdqa %xmm13, %xmm2
4245 ; SSE-NEXT: pand %xmm14, %xmm2
4246 ; SSE-NEXT: por %xmm1, %xmm2
4247 ; SSE-NEXT: movdqa %xmm2, %xmm1
4248 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
4249 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3]
4250 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
4251 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
4252 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
4253 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
4254 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
4255 ; SSE-NEXT: packuswb %xmm2, %xmm1
4256 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3]
4257 ; SSE-NEXT: movdqa %xmm11, %xmm2
4258 ; SSE-NEXT: pandn %xmm1, %xmm2
4259 ; SSE-NEXT: movdqa 256(%rdi), %xmm1
4260 ; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
4261 ; SSE-NEXT: movdqa %xmm10, %xmm4
4262 ; SSE-NEXT: pandn %xmm1, %xmm4
4263 ; SSE-NEXT: movdqa 240(%rdi), %xmm3
4264 ; SSE-NEXT: movdqa %xmm0, %xmm1
4265 ; SSE-NEXT: pandn %xmm3, %xmm1
4266 ; SSE-NEXT: pandn %xmm3, %xmm7
4267 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4268 ; SSE-NEXT: movdqa %xmm14, %xmm7
4269 ; SSE-NEXT: pandn %xmm3, %xmm7
4270 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4271 ; SSE-NEXT: movdqa %xmm10, %xmm7
4272 ; SSE-NEXT: pandn %xmm3, %xmm7
4273 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4274 ; SSE-NEXT: pand %xmm10, %xmm3
4275 ; SSE-NEXT: por %xmm4, %xmm3
4276 ; SSE-NEXT: movdqa %xmm3, %xmm4
4277 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
4278 ; SSE-NEXT: movdqa %xmm8, %xmm7
4279 ; SSE-NEXT: pandn %xmm4, %xmm7
4280 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
4281 ; SSE-NEXT: pand %xmm8, %xmm3
4282 ; SSE-NEXT: por %xmm7, %xmm3
4283 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,1,3,4,5,6,7]
4284 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,7]
4285 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
4286 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,1,3,4,5,6,7]
4287 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,7]
4288 ; SSE-NEXT: packuswb %xmm3, %xmm3
4289 ; SSE-NEXT: pand %xmm11, %xmm3
4290 ; SSE-NEXT: por %xmm2, %xmm3
4291 ; SSE-NEXT: movdqa 304(%rdi), %xmm2
4292 ; SSE-NEXT: movdqa %xmm2, %xmm4
4293 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
4294 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4295 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
4296 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4297 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[0,0]
4298 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,3]
4299 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
4300 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
4301 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4]
4302 ; SSE-NEXT: packuswb %xmm2, %xmm2
4303 ; SSE-NEXT: movdqa %xmm6, %xmm4
4304 ; SSE-NEXT: pandn %xmm2, %xmm4
4305 ; SSE-NEXT: pand %xmm6, %xmm3
4306 ; SSE-NEXT: por %xmm3, %xmm4
4307 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4308 ; SSE-NEXT: movdqa 112(%rdi), %xmm3
4309 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4310 ; SSE-NEXT: movdqa %xmm14, %xmm2
4311 ; SSE-NEXT: pandn %xmm3, %xmm2
4312 ; SSE-NEXT: movdqa 128(%rdi), %xmm3
4313 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4314 ; SSE-NEXT: pand %xmm14, %xmm3
4315 ; SSE-NEXT: por %xmm2, %xmm3
4316 ; SSE-NEXT: movdqa %xmm3, %xmm2
4317 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
4318 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,1,3]
4319 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
4320 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
4321 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
4322 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
4323 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
4324 ; SSE-NEXT: packuswb %xmm3, %xmm2
4325 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,3]
4326 ; SSE-NEXT: movdqa %xmm11, %xmm3
4327 ; SSE-NEXT: pandn %xmm2, %xmm3
4328 ; SSE-NEXT: movdqa 96(%rdi), %xmm4
4329 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4330 ; SSE-NEXT: movdqa %xmm10, %xmm2
4331 ; SSE-NEXT: pandn %xmm4, %xmm2
4332 ; SSE-NEXT: movdqa 80(%rdi), %xmm4
4333 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4334 ; SSE-NEXT: pand %xmm10, %xmm4
4335 ; SSE-NEXT: por %xmm2, %xmm4
4336 ; SSE-NEXT: movdqa %xmm4, %xmm2
4337 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
4338 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
4339 ; SSE-NEXT: pand %xmm8, %xmm4
4340 ; SSE-NEXT: pandn %xmm2, %xmm8
4341 ; SSE-NEXT: por %xmm4, %xmm8
4342 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,2,1,3,4,5,6,7]
4343 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
4344 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
4345 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7]
4346 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
4347 ; SSE-NEXT: packuswb %xmm2, %xmm2
4348 ; SSE-NEXT: pand %xmm11, %xmm2
4349 ; SSE-NEXT: por %xmm3, %xmm2
4350 ; SSE-NEXT: movdqa 144(%rdi), %xmm12
4351 ; SSE-NEXT: movdqa %xmm12, %xmm4
4352 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
4353 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4354 ; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
4355 ; SSE-NEXT: movdqa %xmm12, %xmm3
4356 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4357 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[0,0]
4358 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,3]
4359 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
4360 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
4361 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4]
4362 ; SSE-NEXT: packuswb %xmm3, %xmm3
4363 ; SSE-NEXT: movdqa %xmm6, %xmm14
4364 ; SSE-NEXT: movdqa %xmm6, %xmm4
4365 ; SSE-NEXT: pandn %xmm3, %xmm4
4366 ; SSE-NEXT: pand %xmm6, %xmm2
4367 ; SSE-NEXT: por %xmm2, %xmm4
4368 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4369 ; SSE-NEXT: movdqa %xmm10, %xmm2
4370 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4371 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4372 ; SSE-NEXT: pand %xmm10, %xmm3
4373 ; SSE-NEXT: por %xmm2, %xmm3
4374 ; SSE-NEXT: movdqa %xmm3, %xmm2
4375 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
4376 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
4377 ; SSE-NEXT: movdqa %xmm3, %xmm4
4378 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[0,0]
4379 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[2,3]
4380 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,1,3]
4381 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7]
4382 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,1]
4383 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
4384 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
4385 ; SSE-NEXT: psllq $48, %xmm3
4386 ; SSE-NEXT: packuswb %xmm2, %xmm3
4387 ; SSE-NEXT: movdqa %xmm11, %xmm4
4388 ; SSE-NEXT: pandn %xmm3, %xmm4
4389 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4390 ; SSE-NEXT: movdqa %xmm6, %xmm3
4391 ; SSE-NEXT: pand %xmm0, %xmm3
4392 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4393 ; SSE-NEXT: movdqa %xmm3, %xmm7
4394 ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15]
4395 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,0,65535,65535,65535,0]
4396 ; SSE-NEXT: movdqa %xmm2, %xmm8
4397 ; SSE-NEXT: pandn %xmm7, %xmm8
4398 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
4399 ; SSE-NEXT: pand %xmm2, %xmm3
4400 ; SSE-NEXT: por %xmm8, %xmm3
4401 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3]
4402 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
4403 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
4404 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7]
4405 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,4,5,7]
4406 ; SSE-NEXT: packuswb %xmm3, %xmm3
4407 ; SSE-NEXT: pand %xmm11, %xmm3
4408 ; SSE-NEXT: por %xmm4, %xmm3
4409 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4410 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4411 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[3,0]
4412 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[0,2]
4413 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,7,6,7]
4414 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
4415 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4]
4416 ; SSE-NEXT: packuswb %xmm4, %xmm4
4417 ; SSE-NEXT: movdqa %xmm14, %xmm7
4418 ; SSE-NEXT: pandn %xmm4, %xmm7
4419 ; SSE-NEXT: pand %xmm14, %xmm3
4420 ; SSE-NEXT: por %xmm3, %xmm7
4421 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4422 ; SSE-NEXT: movdqa %xmm10, %xmm3
4423 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4424 ; SSE-NEXT: movdqa %xmm15, %xmm4
4425 ; SSE-NEXT: pand %xmm10, %xmm4
4426 ; SSE-NEXT: movdqa %xmm10, %xmm5
4427 ; SSE-NEXT: por %xmm3, %xmm4
4428 ; SSE-NEXT: movdqa %xmm4, %xmm3
4429 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
4430 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
4431 ; SSE-NEXT: movdqa %xmm4, %xmm7
4432 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[0,0]
4433 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[2,3]
4434 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3]
4435 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,6,5,6,7]
4436 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,1]
4437 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7]
4438 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
4439 ; SSE-NEXT: psllq $48, %xmm4
4440 ; SSE-NEXT: packuswb %xmm3, %xmm4
4441 ; SSE-NEXT: movdqa %xmm11, %xmm3
4442 ; SSE-NEXT: pandn %xmm4, %xmm3
4443 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4444 ; SSE-NEXT: movdqa %xmm10, %xmm4
4445 ; SSE-NEXT: movdqa %xmm0, %xmm8
4446 ; SSE-NEXT: pand %xmm0, %xmm4
4447 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4448 ; SSE-NEXT: movdqa %xmm4, %xmm0
4449 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
4450 ; SSE-NEXT: movdqa %xmm2, %xmm7
4451 ; SSE-NEXT: pandn %xmm0, %xmm7
4452 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
4453 ; SSE-NEXT: pand %xmm2, %xmm4
4454 ; SSE-NEXT: por %xmm7, %xmm4
4455 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,1,3]
4456 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
4457 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
4458 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
4459 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,7]
4460 ; SSE-NEXT: packuswb %xmm0, %xmm0
4461 ; SSE-NEXT: pand %xmm11, %xmm0
4462 ; SSE-NEXT: por %xmm3, %xmm0
4463 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4464 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4465 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[3,0]
4466 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2]
4467 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,7,6,7]
4468 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
4469 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4]
4470 ; SSE-NEXT: packuswb %xmm3, %xmm3
4471 ; SSE-NEXT: movdqa %xmm14, %xmm4
4472 ; SSE-NEXT: pandn %xmm3, %xmm4
4473 ; SSE-NEXT: pand %xmm14, %xmm0
4474 ; SSE-NEXT: por %xmm0, %xmm4
4475 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4476 ; SSE-NEXT: movdqa %xmm5, %xmm0
4477 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4478 ; SSE-NEXT: movdqa %xmm13, %xmm3
4479 ; SSE-NEXT: pand %xmm5, %xmm3
4480 ; SSE-NEXT: por %xmm0, %xmm3
4481 ; SSE-NEXT: movdqa %xmm3, %xmm0
4482 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
4483 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
4484 ; SSE-NEXT: movdqa %xmm3, %xmm4
4485 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0]
4486 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3]
4487 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,1,3]
4488 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,6,7]
4489 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
4490 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
4491 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
4492 ; SSE-NEXT: psllq $48, %xmm3
4493 ; SSE-NEXT: packuswb %xmm0, %xmm3
4494 ; SSE-NEXT: movdqa %xmm11, %xmm0
4495 ; SSE-NEXT: pandn %xmm3, %xmm0
4496 ; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload
4497 ; SSE-NEXT: pand %xmm8, %xmm3
4498 ; SSE-NEXT: movdqa %xmm8, %xmm7
4499 ; SSE-NEXT: por %xmm1, %xmm3
4500 ; SSE-NEXT: movdqa %xmm3, %xmm1
4501 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
4502 ; SSE-NEXT: movdqa %xmm2, %xmm4
4503 ; SSE-NEXT: pandn %xmm1, %xmm4
4504 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
4505 ; SSE-NEXT: pand %xmm2, %xmm3
4506 ; SSE-NEXT: por %xmm4, %xmm3
4507 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3]
4508 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
4509 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
4510 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
4511 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,7]
4512 ; SSE-NEXT: packuswb %xmm1, %xmm1
4513 ; SSE-NEXT: pand %xmm11, %xmm1
4514 ; SSE-NEXT: por %xmm0, %xmm1
4515 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4516 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4517 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0]
4518 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2]
4519 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,7,6,7]
4520 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
4521 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
4522 ; SSE-NEXT: packuswb %xmm0, %xmm0
4523 ; SSE-NEXT: movdqa %xmm14, %xmm3
4524 ; SSE-NEXT: pandn %xmm0, %xmm3
4525 ; SSE-NEXT: pand %xmm14, %xmm1
4526 ; SSE-NEXT: por %xmm1, %xmm3
4527 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4528 ; SSE-NEXT: movdqa %xmm5, %xmm1
4529 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4530 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4531 ; SSE-NEXT: pand %xmm5, %xmm0
4532 ; SSE-NEXT: movdqa %xmm5, %xmm8
4533 ; SSE-NEXT: por %xmm1, %xmm0
4534 ; SSE-NEXT: movdqa %xmm0, %xmm1
4535 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
4536 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
4537 ; SSE-NEXT: movdqa %xmm0, %xmm3
4538 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0]
4539 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
4540 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
4541 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,7]
4542 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,1]
4543 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
4544 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
4545 ; SSE-NEXT: psllq $48, %xmm0
4546 ; SSE-NEXT: packuswb %xmm1, %xmm0
4547 ; SSE-NEXT: movdqa %xmm7, %xmm1
4548 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4549 ; SSE-NEXT: pandn %xmm5, %xmm1
4550 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4551 ; SSE-NEXT: pand %xmm7, %xmm3
4552 ; SSE-NEXT: por %xmm1, %xmm3
4553 ; SSE-NEXT: movdqa %xmm3, %xmm1
4554 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
4555 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
4556 ; SSE-NEXT: pand %xmm2, %xmm3
4557 ; SSE-NEXT: pandn %xmm1, %xmm2
4558 ; SSE-NEXT: por %xmm3, %xmm2
4559 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3]
4560 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
4561 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
4562 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
4563 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,7]
4564 ; SSE-NEXT: packuswb %xmm1, %xmm1
4565 ; SSE-NEXT: pand %xmm11, %xmm1
4566 ; SSE-NEXT: pandn %xmm0, %xmm11
4567 ; SSE-NEXT: por %xmm11, %xmm1
4568 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4569 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm2[3,0]
4570 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[0,2]
4571 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,7,6,7]
4572 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
4573 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
4574 ; SSE-NEXT: packuswb %xmm0, %xmm0
4575 ; SSE-NEXT: movdqa %xmm14, %xmm2
4576 ; SSE-NEXT: pandn %xmm0, %xmm2
4577 ; SSE-NEXT: pand %xmm14, %xmm1
4578 ; SSE-NEXT: por %xmm1, %xmm2
4579 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4580 ; SSE-NEXT: movdqa %xmm6, %xmm1
4581 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
4582 ; SSE-NEXT: pand %xmm11, %xmm1
4583 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4584 ; SSE-NEXT: movdqa %xmm1, %xmm2
4585 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
4586 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535]
4587 ; SSE-NEXT: movdqa %xmm6, %xmm3
4588 ; SSE-NEXT: pandn %xmm2, %xmm3
4589 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
4590 ; SSE-NEXT: pand %xmm6, %xmm1
4591 ; SSE-NEXT: por %xmm3, %xmm1
4592 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
4593 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
4594 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
4595 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
4596 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
4597 ; SSE-NEXT: packuswb %xmm1, %xmm1
4598 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,0,0,65535,65535,65535,65535,65535]
4599 ; SSE-NEXT: movdqa %xmm12, %xmm2
4600 ; SSE-NEXT: pandn %xmm1, %xmm2
4601 ; SSE-NEXT: movdqa %xmm8, %xmm1
4602 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4603 ; SSE-NEXT: movdqa %xmm7, %xmm0
4604 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4605 ; SSE-NEXT: pandn %xmm4, %xmm0
4606 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4607 ; SSE-NEXT: movdqa %xmm11, %xmm3
4608 ; SSE-NEXT: pandn %xmm4, %xmm3
4609 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4610 ; SSE-NEXT: pand %xmm8, %xmm4
4611 ; SSE-NEXT: por %xmm1, %xmm4
4612 ; SSE-NEXT: movdqa %xmm4, %xmm1
4613 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
4614 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
4615 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,1,2,0]
4616 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0]
4617 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2]
4618 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
4619 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
4620 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4621 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
4622 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
4623 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,6,5]
4624 ; SSE-NEXT: packuswb %xmm1, %xmm4
4625 ; SSE-NEXT: pand %xmm12, %xmm4
4626 ; SSE-NEXT: por %xmm2, %xmm4
4627 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4628 ; SSE-NEXT: # xmm1 = mem[1,1,1,1]
4629 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4630 ; SSE-NEXT: # xmm2 = mem[0,2,2,3]
4631 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4632 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7]
4633 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4634 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
4635 ; SSE-NEXT: packuswb %xmm1, %xmm1
4636 ; SSE-NEXT: movdqa %xmm14, %xmm3
4637 ; SSE-NEXT: pandn %xmm1, %xmm3
4638 ; SSE-NEXT: pand %xmm14, %xmm4
4639 ; SSE-NEXT: por %xmm4, %xmm3
4640 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4641 ; SSE-NEXT: pand %xmm11, %xmm10
4642 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
4643 ; SSE-NEXT: movdqa %xmm10, %xmm2
4644 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
4645 ; SSE-NEXT: movdqa %xmm6, %xmm4
4646 ; SSE-NEXT: pandn %xmm2, %xmm4
4647 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
4648 ; SSE-NEXT: pand %xmm6, %xmm10
4649 ; SSE-NEXT: por %xmm4, %xmm10
4650 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,1,2,3,4,5,6,7]
4651 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
4652 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
4653 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
4654 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
4655 ; SSE-NEXT: packuswb %xmm1, %xmm1
4656 ; SSE-NEXT: movdqa %xmm12, %xmm2
4657 ; SSE-NEXT: pandn %xmm1, %xmm2
4658 ; SSE-NEXT: movdqa %xmm8, %xmm1
4659 ; SSE-NEXT: pandn %xmm15, %xmm1
4660 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
4661 ; SSE-NEXT: movdqa %xmm10, %xmm0
4662 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4663 ; SSE-NEXT: pandn %xmm4, %xmm0
4664 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4665 ; SSE-NEXT: movdqa %xmm11, %xmm3
4666 ; SSE-NEXT: pandn %xmm4, %xmm3
4667 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4668 ; SSE-NEXT: pand %xmm8, %xmm4
4669 ; SSE-NEXT: por %xmm1, %xmm4
4670 ; SSE-NEXT: movdqa %xmm4, %xmm1
4671 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
4672 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
4673 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,1,2,0]
4674 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0]
4675 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2]
4676 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
4677 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
4678 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4679 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
4680 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
4681 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,6,5]
4682 ; SSE-NEXT: packuswb %xmm1, %xmm4
4683 ; SSE-NEXT: pand %xmm12, %xmm4
4684 ; SSE-NEXT: por %xmm2, %xmm4
4685 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4686 ; SSE-NEXT: # xmm1 = mem[1,1,1,1]
4687 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4688 ; SSE-NEXT: # xmm2 = mem[0,2,2,3]
4689 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4690 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7]
4691 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4692 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
4693 ; SSE-NEXT: packuswb %xmm1, %xmm1
4694 ; SSE-NEXT: movdqa %xmm14, %xmm2
4695 ; SSE-NEXT: pandn %xmm1, %xmm2
4696 ; SSE-NEXT: pand %xmm14, %xmm4
4697 ; SSE-NEXT: por %xmm4, %xmm2
4698 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4699 ; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
4700 ; SSE-NEXT: pand %xmm11, %xmm1
4701 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4702 ; SSE-NEXT: movdqa %xmm1, %xmm2
4703 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
4704 ; SSE-NEXT: movdqa %xmm6, %xmm4
4705 ; SSE-NEXT: pandn %xmm2, %xmm4
4706 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
4707 ; SSE-NEXT: pand %xmm6, %xmm1
4708 ; SSE-NEXT: por %xmm4, %xmm1
4709 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
4710 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
4711 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
4712 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
4713 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
4714 ; SSE-NEXT: packuswb %xmm1, %xmm1
4715 ; SSE-NEXT: movdqa %xmm12, %xmm2
4716 ; SSE-NEXT: pandn %xmm1, %xmm2
4717 ; SSE-NEXT: movdqa %xmm8, %xmm4
4718 ; SSE-NEXT: pandn %xmm13, %xmm4
4719 ; SSE-NEXT: movdqa %xmm10, %xmm0
4720 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4721 ; SSE-NEXT: pandn %xmm7, %xmm0
4722 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4723 ; SSE-NEXT: movdqa %xmm11, %xmm1
4724 ; SSE-NEXT: pandn %xmm7, %xmm1
4725 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4726 ; SSE-NEXT: pand %xmm8, %xmm7
4727 ; SSE-NEXT: movdqa %xmm8, %xmm10
4728 ; SSE-NEXT: por %xmm4, %xmm7
4729 ; SSE-NEXT: movdqa %xmm7, %xmm4
4730 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
4731 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
4732 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,1,2,0]
4733 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[3,0]
4734 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[0,2]
4735 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
4736 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
4737 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
4738 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
4739 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6]
4740 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5,6,5]
4741 ; SSE-NEXT: packuswb %xmm4, %xmm7
4742 ; SSE-NEXT: pand %xmm12, %xmm7
4743 ; SSE-NEXT: por %xmm2, %xmm7
4744 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4745 ; SSE-NEXT: # xmm2 = mem[1,1,1,1]
4746 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4747 ; SSE-NEXT: # xmm4 = mem[0,2,2,3]
4748 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
4749 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,3,2,3,4,5,6,7]
4750 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
4751 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
4752 ; SSE-NEXT: packuswb %xmm2, %xmm2
4753 ; SSE-NEXT: movdqa %xmm14, %xmm1
4754 ; SSE-NEXT: pandn %xmm2, %xmm1
4755 ; SSE-NEXT: pand %xmm14, %xmm7
4756 ; SSE-NEXT: por %xmm7, %xmm1
4757 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4758 ; SSE-NEXT: movdqa %xmm11, %xmm8
4759 ; SSE-NEXT: movdqa %xmm11, %xmm2
4760 ; SSE-NEXT: pandn %xmm5, %xmm2
4761 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4762 ; SSE-NEXT: pand %xmm11, %xmm4
4763 ; SSE-NEXT: por %xmm2, %xmm4
4764 ; SSE-NEXT: movdqa %xmm4, %xmm2
4765 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
4766 ; SSE-NEXT: movdqa %xmm6, %xmm7
4767 ; SSE-NEXT: pandn %xmm2, %xmm7
4768 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
4769 ; SSE-NEXT: pand %xmm6, %xmm4
4770 ; SSE-NEXT: por %xmm7, %xmm4
4771 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,1,2,3,4,5,6,7]
4772 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
4773 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
4774 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
4775 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,6,7]
4776 ; SSE-NEXT: packuswb %xmm4, %xmm4
4777 ; SSE-NEXT: movdqa %xmm12, %xmm3
4778 ; SSE-NEXT: pandn %xmm4, %xmm3
4779 ; SSE-NEXT: movdqa %xmm10, %xmm7
4780 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4781 ; SSE-NEXT: pandn %xmm5, %xmm7
4782 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4783 ; SSE-NEXT: movdqa %xmm0, %xmm14
4784 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
4785 ; SSE-NEXT: pand %xmm1, %xmm14
4786 ; SSE-NEXT: movdqa %xmm15, %xmm11
4787 ; SSE-NEXT: pand %xmm1, %xmm11
4788 ; SSE-NEXT: movdqa %xmm13, %xmm4
4789 ; SSE-NEXT: pand %xmm1, %xmm4
4790 ; SSE-NEXT: movdqa %xmm5, %xmm2
4791 ; SSE-NEXT: pand %xmm1, %xmm2
4792 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4793 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4794 ; SSE-NEXT: pandn %xmm2, %xmm1
4795 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4796 ; SSE-NEXT: pand %xmm8, %xmm0
4797 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4798 ; SSE-NEXT: pand %xmm8, %xmm15
4799 ; SSE-NEXT: pand %xmm8, %xmm13
4800 ; SSE-NEXT: pand %xmm8, %xmm5
4801 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4802 ; SSE-NEXT: movdqa %xmm2, %xmm0
4803 ; SSE-NEXT: pandn %xmm2, %xmm8
4804 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4805 ; SSE-NEXT: pand %xmm10, %xmm0
4806 ; SSE-NEXT: por %xmm7, %xmm0
4807 ; SSE-NEXT: movdqa %xmm0, %xmm7
4808 ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15]
4809 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
4810 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
4811 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[3,0]
4812 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[0,2]
4813 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,1,2,3,4,5,6,7]
4814 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
4815 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4816 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
4817 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
4818 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
4819 ; SSE-NEXT: packuswb %xmm0, %xmm1
4820 ; SSE-NEXT: pand %xmm12, %xmm1
4821 ; SSE-NEXT: por %xmm3, %xmm1
4822 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4823 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
4824 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4825 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3]
4826 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
4827 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7]
4828 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
4829 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
4830 ; SSE-NEXT: packuswb %xmm0, %xmm0
4831 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
4832 ; SSE-NEXT: movdqa %xmm10, %xmm2
4833 ; SSE-NEXT: pandn %xmm0, %xmm2
4834 ; SSE-NEXT: pand %xmm10, %xmm1
4835 ; SSE-NEXT: por %xmm1, %xmm2
4836 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4837 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4838 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
4839 ; SSE-NEXT: pand %xmm3, %xmm0
4840 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4841 ; SSE-NEXT: movdqa %xmm0, %xmm1
4842 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
4843 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
4844 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0]
4845 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
4846 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
4847 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7]
4848 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7]
4849 ; SSE-NEXT: packuswb %xmm0, %xmm0
4850 ; SSE-NEXT: movdqa %xmm12, %xmm1
4851 ; SSE-NEXT: pandn %xmm0, %xmm1
4852 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
4853 ; SSE-NEXT: movdqa %xmm14, %xmm0
4854 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
4855 ; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15]
4856 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm0[2,0]
4857 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,6,5,6,7]
4858 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
4859 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
4860 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
4861 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
4862 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
4863 ; SSE-NEXT: packuswb %xmm2, %xmm0
4864 ; SSE-NEXT: pand %xmm12, %xmm0
4865 ; SSE-NEXT: por %xmm1, %xmm0
4866 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4867 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4868 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
4869 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
4870 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7]
4871 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4872 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5]
4873 ; SSE-NEXT: packuswb %xmm1, %xmm1
4874 ; SSE-NEXT: movdqa %xmm10, %xmm9
4875 ; SSE-NEXT: movdqa %xmm10, %xmm14
4876 ; SSE-NEXT: pandn %xmm1, %xmm14
4877 ; SSE-NEXT: pand %xmm10, %xmm0
4878 ; SSE-NEXT: por %xmm0, %xmm14
4879 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4880 ; SSE-NEXT: pand %xmm3, %xmm0
4881 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4882 ; SSE-NEXT: movdqa %xmm0, %xmm1
4883 ; SSE-NEXT: pxor %xmm2, %xmm2
4884 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
4885 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
4886 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0]
4887 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
4888 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
4889 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7]
4890 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7]
4891 ; SSE-NEXT: packuswb %xmm0, %xmm0
4892 ; SSE-NEXT: movdqa %xmm12, %xmm1
4893 ; SSE-NEXT: pandn %xmm0, %xmm1
4894 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
4895 ; SSE-NEXT: movdqa %xmm11, %xmm0
4896 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
4897 ; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15]
4898 ; SSE-NEXT: pxor %xmm10, %xmm10
4899 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm0[2,0]
4900 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,6,5,6,7]
4901 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
4902 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
4903 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
4904 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
4905 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
4906 ; SSE-NEXT: packuswb %xmm2, %xmm0
4907 ; SSE-NEXT: pand %xmm12, %xmm0
4908 ; SSE-NEXT: por %xmm1, %xmm0
4909 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4910 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4911 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
4912 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
4913 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7]
4914 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4915 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5]
4916 ; SSE-NEXT: packuswb %xmm1, %xmm1
4917 ; SSE-NEXT: movdqa %xmm9, %xmm11
4918 ; SSE-NEXT: pandn %xmm1, %xmm11
4919 ; SSE-NEXT: pand %xmm9, %xmm0
4920 ; SSE-NEXT: por %xmm0, %xmm11
4921 ; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
4922 ; SSE-NEXT: pand %xmm3, %xmm0
4923 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4924 ; SSE-NEXT: movdqa %xmm0, %xmm1
4925 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15]
4926 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
4927 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0]
4928 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
4929 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
4930 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7]
4931 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7]
4932 ; SSE-NEXT: packuswb %xmm0, %xmm0
4933 ; SSE-NEXT: movdqa %xmm12, %xmm1
4934 ; SSE-NEXT: pandn %xmm0, %xmm1
4935 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4936 ; SSE-NEXT: movdqa %xmm4, %xmm0
4937 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
4938 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
4939 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[2,0]
4940 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7]
4941 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
4942 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
4943 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
4944 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
4945 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
4946 ; SSE-NEXT: packuswb %xmm2, %xmm0
4947 ; SSE-NEXT: pand %xmm12, %xmm0
4948 ; SSE-NEXT: por %xmm1, %xmm0
4949 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4950 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4951 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
4952 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
4953 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7]
4954 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4955 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5]
4956 ; SSE-NEXT: packuswb %xmm1, %xmm2
4957 ; SSE-NEXT: movdqa %xmm9, %xmm10
4958 ; SSE-NEXT: pandn %xmm2, %xmm10
4959 ; SSE-NEXT: pand %xmm9, %xmm0
4960 ; SSE-NEXT: por %xmm0, %xmm10
4961 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4962 ; SSE-NEXT: movdqa %xmm3, %xmm2
4963 ; SSE-NEXT: pand %xmm3, %xmm0
4964 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4965 ; SSE-NEXT: por %xmm0, %xmm2
4966 ; SSE-NEXT: movdqa %xmm2, %xmm0
4967 ; SSE-NEXT: pxor %xmm1, %xmm1
4968 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
4969 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
4970 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,0]
4971 ; SSE-NEXT: movaps %xmm2, %xmm4
4972 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4973 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4974 ; SSE-NEXT: movdqa %xmm2, %xmm0
4975 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4976 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
4977 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,0]
4978 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
4979 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
4980 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
4981 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
4982 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
4983 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
4984 ; SSE-NEXT: packuswb %xmm2, %xmm0
4985 ; SSE-NEXT: pand %xmm12, %xmm0
4986 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,6,5]
4987 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
4988 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
4989 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,4,6,7]
4990 ; SSE-NEXT: packuswb %xmm2, %xmm2
4991 ; SSE-NEXT: pandn %xmm2, %xmm12
4992 ; SSE-NEXT: por %xmm12, %xmm0
4993 ; SSE-NEXT: movdqa %xmm8, %xmm3
4994 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[3,0]
4995 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0,2]
4996 ; SSE-NEXT: pand %xmm9, %xmm0
4997 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7]
4998 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
4999 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,5]
5000 ; SSE-NEXT: packuswb %xmm2, %xmm2
5001 ; SSE-NEXT: pandn %xmm2, %xmm9
5002 ; SSE-NEXT: por %xmm0, %xmm9
5003 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5004 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5005 ; SSE-NEXT: movdqa %xmm3, %xmm0
5006 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
5007 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
5008 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,1,3]
5009 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm0[1,2]
5010 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,3,1]
5011 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,1,2,3,4,5,6,7]
5012 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
5013 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
5014 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7]
5015 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
5016 ; SSE-NEXT: packuswb %xmm0, %xmm2
5017 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535]
5018 ; SSE-NEXT: movdqa %xmm4, %xmm3
5019 ; SSE-NEXT: pandn %xmm2, %xmm3
5020 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5021 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
5022 ; SSE-NEXT: pand %xmm12, %xmm8
5023 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
5024 ; SSE-NEXT: movdqa %xmm8, %xmm2
5025 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
5026 ; SSE-NEXT: movdqa %xmm6, %xmm7
5027 ; SSE-NEXT: pandn %xmm2, %xmm7
5028 ; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15]
5029 ; SSE-NEXT: pand %xmm6, %xmm8
5030 ; SSE-NEXT: por %xmm7, %xmm8
5031 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,1,0,3,4,5,6,7]
5032 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4]
5033 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
5034 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,0,4,5,6,7]
5035 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
5036 ; SSE-NEXT: packuswb %xmm2, %xmm2
5037 ; SSE-NEXT: pand %xmm4, %xmm2
5038 ; SSE-NEXT: por %xmm3, %xmm2
5039 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5040 ; SSE-NEXT: # xmm3 = mem[3,1,2,3]
5041 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
5042 ; SSE-NEXT: # xmm7 = mem[0,2,2,3]
5043 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
5044 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
5045 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
5046 ; SSE-NEXT: packuswb %xmm0, %xmm7
5047 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,1]
5048 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
5049 ; SSE-NEXT: movdqa %xmm15, %xmm0
5050 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
5051 ; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3],xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7]
5052 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,1,1,3]
5053 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,3],xmm0[1,2]
5054 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2,3,1]
5055 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[3,1,2,3,4,5,6,7]
5056 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
5057 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
5058 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7]
5059 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
5060 ; SSE-NEXT: packuswb %xmm0, %xmm3
5061 ; SSE-NEXT: movdqa %xmm4, %xmm7
5062 ; SSE-NEXT: pandn %xmm3, %xmm7
5063 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5064 ; SSE-NEXT: pand %xmm12, %xmm15
5065 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
5066 ; SSE-NEXT: movdqa %xmm15, %xmm3
5067 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
5068 ; SSE-NEXT: movdqa %xmm6, %xmm8
5069 ; SSE-NEXT: pandn %xmm3, %xmm8
5070 ; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15]
5071 ; SSE-NEXT: pand %xmm6, %xmm15
5072 ; SSE-NEXT: por %xmm8, %xmm15
5073 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[3,1,0,3,4,5,6,7]
5074 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4]
5075 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
5076 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,0,4,5,6,7]
5077 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm3[0,1,2,3,6,5,6,7]
5078 ; SSE-NEXT: packuswb %xmm8, %xmm8
5079 ; SSE-NEXT: pand %xmm4, %xmm8
5080 ; SSE-NEXT: por %xmm7, %xmm8
5081 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5082 ; SSE-NEXT: # xmm3 = mem[3,1,2,3]
5083 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
5084 ; SSE-NEXT: # xmm7 = mem[0,2,2,3]
5085 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
5086 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
5087 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
5088 ; SSE-NEXT: packuswb %xmm0, %xmm7
5089 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[2,1]
5090 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
5091 ; SSE-NEXT: movdqa %xmm13, %xmm0
5092 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
5093 ; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3],xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
5094 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,1,3]
5095 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm0[1,2]
5096 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2,3,1]
5097 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,1,2,3,4,5,6,7]
5098 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
5099 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
5100 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7]
5101 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
5102 ; SSE-NEXT: packuswb %xmm0, %xmm3
5103 ; SSE-NEXT: movdqa %xmm4, %xmm7
5104 ; SSE-NEXT: pandn %xmm3, %xmm7
5105 ; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload
5106 ; SSE-NEXT: pand %xmm12, %xmm13
5107 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
5108 ; SSE-NEXT: movdqa %xmm13, %xmm3
5109 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
5110 ; SSE-NEXT: movdqa %xmm6, %xmm5
5111 ; SSE-NEXT: pandn %xmm3, %xmm5
5112 ; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm1[8],xmm13[9],xmm1[9],xmm13[10],xmm1[10],xmm13[11],xmm1[11],xmm13[12],xmm1[12],xmm13[13],xmm1[13],xmm13[14],xmm1[14],xmm13[15],xmm1[15]
5113 ; SSE-NEXT: pand %xmm6, %xmm13
5114 ; SSE-NEXT: por %xmm5, %xmm13
5115 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,1,0,3,4,5,6,7]
5116 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4]
5117 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
5118 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,0,4,5,6,7]
5119 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,6,7]
5120 ; SSE-NEXT: packuswb %xmm5, %xmm5
5121 ; SSE-NEXT: pand %xmm4, %xmm5
5122 ; SSE-NEXT: por %xmm7, %xmm5
5123 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5124 ; SSE-NEXT: # xmm3 = mem[3,1,2,3]
5125 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
5126 ; SSE-NEXT: # xmm7 = mem[0,2,2,3]
5127 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
5128 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
5129 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
5130 ; SSE-NEXT: packuswb %xmm0, %xmm7
5131 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,1]
5132 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
5133 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
5134 ; SSE-NEXT: movdqa %xmm7, %xmm0
5135 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
5136 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
5137 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,1,3]
5138 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm0[1,2]
5139 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5140 ; SSE-NEXT: pand %xmm12, %xmm0
5141 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
5142 ; SSE-NEXT: por %xmm0, %xmm12
5143 ; SSE-NEXT: movdqa %xmm12, %xmm0
5144 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
5145 ; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15]
5146 ; SSE-NEXT: pand %xmm6, %xmm12
5147 ; SSE-NEXT: pandn %xmm0, %xmm6
5148 ; SSE-NEXT: por %xmm12, %xmm6
5149 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,1,0,3,4,5,6,7]
5150 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
5151 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
5152 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
5153 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
5154 ; SSE-NEXT: packuswb %xmm0, %xmm0
5155 ; SSE-NEXT: pand %xmm4, %xmm0
5156 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,3,1]
5157 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7]
5158 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
5159 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
5160 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,3,0,4,5,6,7]
5161 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
5162 ; SSE-NEXT: packuswb %xmm6, %xmm3
5163 ; SSE-NEXT: pandn %xmm3, %xmm4
5164 ; SSE-NEXT: por %xmm4, %xmm0
5165 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5166 ; SSE-NEXT: # xmm3 = mem[3,1,2,3]
5167 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5168 ; SSE-NEXT: # xmm4 = mem[0,2,2,3]
5169 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
5170 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
5171 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
5172 ; SSE-NEXT: packuswb %xmm6, %xmm4
5173 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,1]
5174 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5175 ; SSE-NEXT: movaps %xmm3, 16(%rsi)
5176 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5177 ; SSE-NEXT: movaps %xmm3, 48(%rsi)
5178 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5179 ; SSE-NEXT: movaps %xmm3, (%rsi)
5180 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5181 ; SSE-NEXT: movaps %xmm3, 32(%rsi)
5182 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5183 ; SSE-NEXT: movaps %xmm3, 16(%rdx)
5184 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5185 ; SSE-NEXT: movaps %xmm3, 48(%rdx)
5186 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5187 ; SSE-NEXT: movaps %xmm3, (%rdx)
5188 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5189 ; SSE-NEXT: movaps %xmm3, 32(%rdx)
5190 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5191 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
5192 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5193 ; SSE-NEXT: movaps %xmm1, 48(%rcx)
5194 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5195 ; SSE-NEXT: movaps %xmm1, (%rcx)
5196 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5197 ; SSE-NEXT: movaps %xmm1, 32(%rcx)
5198 ; SSE-NEXT: movdqa %xmm9, 16(%r8)
5199 ; SSE-NEXT: movdqa %xmm10, 48(%r8)
5200 ; SSE-NEXT: movdqa %xmm11, (%r8)
5201 ; SSE-NEXT: movdqa %xmm14, 32(%r8)
5202 ; SSE-NEXT: movaps %xmm0, 16(%r9)
5203 ; SSE-NEXT: movaps %xmm5, 48(%r9)
5204 ; SSE-NEXT: movaps %xmm8, (%r9)
5205 ; SSE-NEXT: movaps %xmm2, 32(%r9)
5206 ; SSE-NEXT: addq $552, %rsp # imm = 0x228
5207 ; SSE-NEXT: retq
5208 ;
5209 ; AVX-LABEL: load_i8_stride5_vf64:
5211 ; AVX-NEXT: subq $488, %rsp # imm = 0x1E8
5212 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,9,14,0,4,9,14,0,4,9,14,0,4,9,14,0]
5213 ; AVX-NEXT: vmovdqa (%rdi), %xmm4
5214 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
5215 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm7
5216 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm5
5217 ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm0
5218 ; AVX-NEXT: vmovdqa %xmm1, %xmm11
5219 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5220 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15]
5221 ; AVX-NEXT: vpshufb %xmm3, %xmm4, %xmm1
5222 ; AVX-NEXT: vmovdqa %xmm4, %xmm8
5223 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5224 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
5225 ; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,2,7,12,0,0,128,128,128,2,7,12,0,0,128]
5226 ; AVX-NEXT: # xmm4 = mem[0,0]
5227 ; AVX-NEXT: vpshufb %xmm4, %xmm5, %xmm0
5228 ; AVX-NEXT: vmovdqa %xmm5, %xmm9
5229 ; AVX-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill
5230 ; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [8,13,128,128,128,0,0,3,8,13,128,128,128,0,0,3]
5231 ; AVX-NEXT: # xmm5 = mem[0,0]
5232 ; AVX-NEXT: vpshufb %xmm5, %xmm7, %xmm6
5233 ; AVX-NEXT: vmovdqa %xmm7, %xmm12
5234 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5235 ; AVX-NEXT: vpor %xmm0, %xmm6, %xmm6
5236 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
5237 ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm6, %xmm1
5238 ; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5239 ; AVX-NEXT: vmovdqa 176(%rdi), %xmm1
5240 ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm2
5241 ; AVX-NEXT: vmovdqa %xmm1, %xmm14
5242 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5243 ; AVX-NEXT: vmovdqa 160(%rdi), %xmm1
5244 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3
5245 ; AVX-NEXT: vmovdqa %xmm1, %xmm13
5246 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5247 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
5248 ; AVX-NEXT: vmovdqa 208(%rdi), %xmm1
5249 ; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm3
5250 ; AVX-NEXT: vmovdqa %xmm1, %xmm10
5251 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5252 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm1
5253 ; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm4
5254 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5255 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
5256 ; AVX-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm2
5257 ; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5258 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,6,11,128,128,128,128,0,0,0,0,0,0,0,0,0]
5259 ; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm4
5260 ; AVX-NEXT: vmovq {{.*#+}} xmm5 = [128,128,128,0,5,10,15,0,0,0,0,0,0,0,0,0]
5261 ; AVX-NEXT: vpshufb %xmm5, %xmm11, %xmm6
5262 ; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4
5263 ; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128]
5264 ; AVX-NEXT: # xmm7 = mem[0,0]
5265 ; AVX-NEXT: vpshufb %xmm7, %xmm9, %xmm6
5266 ; AVX-NEXT: vmovddup {{.*#+}} xmm8 = [9,14,128,128,128,0,0,4,9,14,128,128,128,0,0,4]
5267 ; AVX-NEXT: # xmm8 = mem[0,0]
5268 ; AVX-NEXT: vpshufb %xmm8, %xmm12, %xmm9
5269 ; AVX-NEXT: vpor %xmm6, %xmm9, %xmm6
5270 ; AVX-NEXT: vpblendvb %xmm0, %xmm4, %xmm6, %xmm2
5271 ; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5272 ; AVX-NEXT: vpshufb %xmm3, %xmm13, %xmm3
5273 ; AVX-NEXT: vpshufb %xmm5, %xmm14, %xmm4
5274 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
5275 ; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11]
5276 ; AVX-NEXT: # xmm11 = mem[0,0]
5277 ; AVX-NEXT: vpshufb %xmm7, %xmm10, %xmm4
5278 ; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm5
5279 ; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4
5280 ; AVX-NEXT: vmovdqa 144(%rdi), %xmm1
5281 ; AVX-NEXT: vpblendvb %xmm0, %xmm3, %xmm4, %xmm0
5282 ; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5283 ; AVX-NEXT: vpshufb %xmm11, %xmm1, %xmm0
5284 ; AVX-NEXT: vmovdqa %xmm1, %xmm8
5285 ; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128]
5286 ; AVX-NEXT: # xmm7 = mem[0,0]
5287 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm13
5288 ; AVX-NEXT: vpshufb %xmm7, %xmm13, %xmm3
5289 ; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0
5290 ; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [8,13,0,0,128,128,128,3,8,13,0,0,128,128,128,3]
5291 ; AVX-NEXT: # xmm5 = mem[0,0]
5292 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm1
5293 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5294 ; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm3
5295 ; AVX-NEXT: vmovddup {{.*#+}} xmm6 = [128,128,0,0,4,9,14,128,128,128,0,0,4,9,14,128]
5296 ; AVX-NEXT: # xmm6 = mem[0,0]
5297 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm1
5298 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5299 ; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm12
5300 ; AVX-NEXT: vpor %xmm3, %xmm12, %xmm3
5301 ; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4
5302 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm3[2,3,4,5,6,7]
5303 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,5,10,15,128,128,128,128,128,128,u,u,u,u,u,u]
5304 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm14
5305 ; AVX-NEXT: vpshufb %xmm3, %xmm14, %xmm15
5306 ; AVX-NEXT: vpor %xmm15, %xmm12, %xmm12
5307 ; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm12[0,1,2,3,4],xmm0[5,6,7]
5308 ; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
5309 ; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload
5310 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,1,6,11,0,1,6,11,0,1,6,11,0,1,6,11]
5311 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm1
5312 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5313 ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm9
5314 ; AVX-NEXT: vandnps %ymm9, %ymm12, %ymm9
5315 ; AVX-NEXT: vorps %ymm0, %ymm9, %ymm0
5316 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0
5317 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5318 ; AVX-NEXT: vmovdqa 304(%rdi), %xmm0
5319 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5320 ; AVX-NEXT: vpshufb %xmm11, %xmm0, %xmm0
5321 ; AVX-NEXT: vmovdqa 288(%rdi), %xmm1
5322 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5323 ; AVX-NEXT: vpshufb %xmm7, %xmm1, %xmm9
5324 ; AVX-NEXT: vpor %xmm0, %xmm9, %xmm0
5325 ; AVX-NEXT: vmovdqa 272(%rdi), %xmm10
5326 ; AVX-NEXT: vpshufb %xmm5, %xmm10, %xmm5
5327 ; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5328 ; AVX-NEXT: vmovdqa 256(%rdi), %xmm9
5329 ; AVX-NEXT: vpshufb %xmm6, %xmm9, %xmm6
5330 ; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5
5331 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7]
5332 ; AVX-NEXT: vmovdqa 240(%rdi), %xmm1
5333 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5334 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3
5335 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
5336 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
5337 ; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
5338 ; AVX-NEXT: vmovdqa 224(%rdi), %xmm5
5339 ; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm2
5340 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5341 ; AVX-NEXT: vandnps %ymm2, %ymm12, %ymm2
5342 ; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2
5343 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
5344 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5345 ; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12]
5346 ; AVX-NEXT: # xmm2 = mem[0,0]
5347 ; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm0
5348 ; AVX-NEXT: vmovdqa %xmm8, %xmm11
5349 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5350 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128]
5351 ; AVX-NEXT: # xmm3 = mem[0,0]
5352 ; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5353 ; AVX-NEXT: vpshufb %xmm3, %xmm13, %xmm4
5354 ; AVX-NEXT: vpor %xmm0, %xmm4, %xmm4
5355 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
5356 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u],zero,zero,zero,zero,xmm7[4,9,14,u,u,u,u,u,u]
5357 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5358 ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,0,5,10,15],zero,zero,zero,xmm1[u,u,u,u,u,u]
5359 ; AVX-NEXT: vpor %xmm0, %xmm8, %xmm8
5360 ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,4,5,6,7,8,9,u,u,u,u,u,u]
5361 ; AVX-NEXT: vmovdqa %xmm14, %xmm6
5362 ; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5363 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm14[u,u,u,u,u,u]
5364 ; AVX-NEXT: vpor %xmm14, %xmm8, %xmm8
5365 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3,4],xmm4[5,6,7]
5366 ; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
5367 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,2,7,12,0,2,7,12,0,2,7,12,0,2,7,12]
5368 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5369 ; AVX-NEXT: vpshufb %xmm14, %xmm0, %xmm15
5370 ; AVX-NEXT: vandnps %ymm15, %ymm12, %ymm15
5371 ; AVX-NEXT: vorps %ymm15, %ymm8, %ymm8
5372 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4
5373 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5374 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5375 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm2
5376 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5377 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm3
5378 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
5379 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u,u,u,u]
5380 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,0,5,10,15],zero,zero,zero,xmm9[u,u,u,u,u,u]
5381 ; AVX-NEXT: vmovdqa %xmm9, %xmm8
5382 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5383 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
5384 ; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u]
5385 ; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm3
5386 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5387 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm10[u,u,u,u,u,u]
5388 ; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0
5389 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
5390 ; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
5391 ; AVX-NEXT: vpshufb %xmm14, %xmm5, %xmm3
5392 ; AVX-NEXT: vandnps %ymm3, %ymm12, %ymm3
5393 ; AVX-NEXT: vorps %ymm3, %ymm2, %ymm2
5394 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
5395 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5396 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13]
5397 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
5398 ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
5399 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,1,6,11],zero,zero,zero,zero,xmm1[u,u,u,u,u,u]
5400 ; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [10,15,0,128,128,128,0,5,10,15,0,128,128,128,0,5]
5401 ; AVX-NEXT: # xmm4 = mem[0,0]
5402 ; AVX-NEXT: vpshufb %xmm4, %xmm7, %xmm5
5403 ; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3
5404 ; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm3
5405 ; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [2,7,12,128,128,128,128,128,128,128,u,u,u,u,u,u]
5406 ; AVX-NEXT: vpshufb %xmm11, %xmm6, %xmm14
5407 ; AVX-NEXT: vpor %xmm3, %xmm14, %xmm3
5408 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm0[5,6,7]
5409 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5410 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
5411 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,4,9,14,0,128,128,128,128,4,9,14,0,128,128]
5412 ; AVX-NEXT: # xmm3 = mem[0,0]
5413 ; AVX-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload
5414 ; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm12
5415 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3,4,5,6,7]
5416 ; AVX-NEXT: vmovq {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,0,0,0,0,0,0,0,0]
5417 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5418 ; AVX-NEXT: vpshufb %xmm14, %xmm15, %xmm0
5419 ; AVX-NEXT: vmovddup {{.*#+}} xmm9 = [10,15,128,128,128,0,0,5,10,15,128,128,128,0,0,5]
5420 ; AVX-NEXT: # xmm9 = mem[0,0]
5421 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5422 ; AVX-NEXT: vpshufb %xmm9, %xmm5, %xmm13
5423 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm13[3,4,5,6,7]
5424 ; AVX-NEXT: vpor %xmm0, %xmm12, %xmm0
5425 ; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
5426 ; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0
5427 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5428 ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13]
5429 ; AVX-NEXT: vandnps %ymm13, %ymm12, %ymm13
5430 ; AVX-NEXT: vorps %ymm0, %ymm13, %ymm0
5431 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
5432 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5433 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5434 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13]
5435 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5436 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
5437 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
5438 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,1,6,11],zero,zero,zero,zero,xmm8[u,u,u,u,u,u]
5439 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5440 ; AVX-NEXT: vpshufb %xmm4, %xmm13, %xmm4
5441 ; AVX-NEXT: vpor %xmm1, %xmm4, %xmm1
5442 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,4,5,6,7,8,9,u,u,u,u,u,u]
5443 ; AVX-NEXT: vpshufb %xmm11, %xmm10, %xmm4
5444 ; AVX-NEXT: vpor %xmm4, %xmm1, %xmm1
5445 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
5446 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
5447 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm11[1,6,11,u,u,u,u,u,u,u,u,u,u]
5448 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5449 ; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm3
5450 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3,4,5,6,7]
5451 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5452 ; AVX-NEXT: vpshufb %xmm14, %xmm10, %xmm3
5453 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
5454 ; AVX-NEXT: vpshufb %xmm9, %xmm14, %xmm2
5455 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7]
5456 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1
5457 ; AVX-NEXT: vandps %ymm1, %ymm12, %ymm1
5458 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5459 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13]
5460 ; AVX-NEXT: vandnps %ymm2, %ymm12, %ymm2
5461 ; AVX-NEXT: vorps %ymm2, %ymm1, %ymm1
5462 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
5463 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5464 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
5465 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm5[u,u,u]
5466 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
5467 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
5468 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u],zero,zero,zero,xmm7[0,5,10,15,u,u,u]
5469 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5,6,7]
5470 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm3
5471 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5472 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[1,6,11,u,u,u,u,u,u,u]
5473 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5474 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,7,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
5475 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
5476 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,4,5,6,7,8,u,u,u,u,u,u,u]
5477 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [3,8,13,128,128,128,128,128,128,u,u,u,u,u,u,u]
5478 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5479 ; AVX-NEXT: vpshufb %xmm2, %xmm15, %xmm1
5480 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
5481 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,4,9,14,0,4,9,14,0,4,9,14,0,4,9,14]
5482 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5483 ; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm4
5484 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm4
5485 ; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255]
5486 ; AVX-NEXT: vandps %ymm3, %ymm12, %ymm3
5487 ; AVX-NEXT: vandnps %ymm4, %ymm12, %ymm4
5488 ; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3
5489 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5490 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14]
5491 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5492 ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
5493 ; AVX-NEXT: vpor %xmm5, %xmm7, %xmm5
5494 ; AVX-NEXT: vextractf128 $1, %ymm3, %xmm7
5495 ; AVX-NEXT: vpmovsxwq {{.*#+}} xmm0 = [18446744073709551615,255]
5496 ; AVX-NEXT: vpblendvb %xmm0, %xmm7, %xmm5, %xmm5
5497 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
5498 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5499 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm11[2,7,12,u,u,u,u,u,u,u,u,u,u]
5500 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm14[u,u,u]
5501 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5,6,7]
5502 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
5503 ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u],zero,zero,zero,xmm8[0,5,10,15,u,u,u]
5504 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4,5,6,7]
5505 ; AVX-NEXT: vpor %xmm3, %xmm5, %xmm3
5506 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u]
5507 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
5508 ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u]
5509 ; AVX-NEXT: vpor %xmm5, %xmm7, %xmm5
5510 ; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u]
5511 ; AVX-NEXT: vpshufb %xmm7, %xmm5, %xmm5
5512 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
5513 ; AVX-NEXT: vpshufb %xmm2, %xmm14, %xmm2
5514 ; AVX-NEXT: vpor %xmm2, %xmm5, %xmm2
5515 ; AVX-NEXT: vpshufb %xmm1, %xmm9, %xmm1
5516 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
5517 ; AVX-NEXT: vandps %ymm3, %ymm12, %ymm2
5518 ; AVX-NEXT: vandnps %ymm1, %ymm12, %ymm0
5519 ; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0
5520 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5521 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm9[4,9,14]
5522 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5523 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
5524 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1
5525 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
5526 ; AVX-NEXT: vpmovsxwq {{.*#+}} xmm8 = [18446744073709551615,255]
5527 ; AVX-NEXT: vpblendvb %xmm8, %xmm2, %xmm1, %xmm1
5528 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
5529 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5530 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,6,11,128,128,128,128,0,1,6,11,128,128,128,128]
5531 ; AVX-NEXT: # xmm0 = mem[0,0]
5532 ; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm2
5533 ; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [0,128,128,128,0,5,10,15,0,128,128,128,0,5,10,15]
5534 ; AVX-NEXT: # xmm1 = mem[0,0]
5535 ; AVX-NEXT: vpshufb %xmm1, %xmm6, %xmm3
5536 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
5537 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5538 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12,u,u,u,u,u,u,u]
5539 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [128,0,0,3,8,13,128,128,128,0,0,3,8,13,128,128]
5540 ; AVX-NEXT: # xmm3 = mem[0,0]
5541 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5542 ; AVX-NEXT: vpshufb %xmm3, %xmm6, %xmm6
5543 ; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4
5544 ; AVX-NEXT: vpshufb %xmm7, %xmm4, %xmm4
5545 ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u]
5546 ; AVX-NEXT: vpor %xmm7, %xmm4, %xmm4
5547 ; AVX-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload
5548 ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11,u,u,u,u]
5549 ; AVX-NEXT: vmovq {{.*#+}} xmm7 = [128,128,128,3,8,13,0,0,0,0,0,0,0,0,0,0]
5550 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
5551 ; AVX-NEXT: vpshufb %xmm7, %xmm12, %xmm12
5552 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5],xmm12[6,7]
5553 ; AVX-NEXT: vmovddup {{.*#+}} xmm12 = [12,128,128,128,0,0,2,7,12,128,128,128,0,0,2,7]
5554 ; AVX-NEXT: # xmm12 = mem[0,0]
5555 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5556 ; AVX-NEXT: vpshufb %xmm12, %xmm13, %xmm13
5557 ; AVX-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,128,128,128,0,0,0,0,0,0,0,0,0,0]
5558 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5559 ; AVX-NEXT: vpshufb %xmm6, %xmm15, %xmm15
5560 ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7]
5561 ; AVX-NEXT: vpor %xmm10, %xmm13, %xmm10
5562 ; AVX-NEXT: vpblendvb %xmm8, %xmm4, %xmm2, %xmm2
5563 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15]
5564 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5565 ; AVX-NEXT: vpshufb %xmm13, %xmm15, %xmm15
5566 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm15, %ymm4
5567 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7]
5568 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
5569 ; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm0
5570 ; AVX-NEXT: vpshufb %xmm1, %xmm9, %xmm1
5571 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
5572 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5573 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u]
5574 ; AVX-NEXT: vpshufb %xmm3, %xmm11, %xmm3
5575 ; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1
5576 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,4,5,6,7,8,u,u,u,u,u,u,u]
5577 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[4,9,14],zero,zero,zero,zero,zero,zero,xmm14[u,u,u,u,u,u,u]
5578 ; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1
5579 ; AVX-NEXT: vpblendvb %xmm8, %xmm1, %xmm0, %xmm0
5580 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5581 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
5582 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5583 ; AVX-NEXT: vpshufb %xmm7, %xmm4, %xmm4
5584 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5],xmm4[6,7]
5585 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5586 ; AVX-NEXT: vpshufb %xmm12, %xmm4, %xmm4
5587 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5588 ; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5
5589 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7]
5590 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
5591 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5592 ; AVX-NEXT: vpshufb %xmm13, %xmm4, %xmm4
5593 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
5594 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
5595 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
5596 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5597 ; AVX-NEXT: vmovaps %ymm1, 32(%rsi)
5598 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5599 ; AVX-NEXT: vmovaps %ymm1, (%rsi)
5600 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5601 ; AVX-NEXT: vmovaps %ymm1, 32(%rdx)
5602 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5603 ; AVX-NEXT: vmovaps %ymm1, (%rdx)
5604 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5605 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
5606 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5607 ; AVX-NEXT: vmovaps %ymm1, (%rcx)
5608 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5609 ; AVX-NEXT: vmovaps %ymm1, 32(%r8)
5610 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5611 ; AVX-NEXT: vmovaps %ymm1, (%r8)
5612 ; AVX-NEXT: vmovaps %ymm0, 32(%r9)
5613 ; AVX-NEXT: vmovaps %ymm2, (%r9)
5614 ; AVX-NEXT: addq $488, %rsp # imm = 0x1E8
5615 ; AVX-NEXT: vzeroupper
5616 ; AVX-NEXT: retq
5618 ; AVX2-LABEL: load_i8_stride5_vf64:
5619 ; AVX2: # %bb.0:
5620 ; AVX2-NEXT: subq $136, %rsp
5621 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm2
5622 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm4
5623 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm10
5624 ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm9
5625 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
5626 ; AVX2-NEXT: vpblendvb %ymm12, %ymm10, %ymm9, %ymm0
5627 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5628 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255]
5629 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
5630 ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm15
5631 ; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0
5632 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5633 ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm7
5634 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
5635 ; AVX2-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
5636 ; AVX2-NEXT: vmovdqa %ymm1, %ymm5
5637 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5638 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0]
5639 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
5640 ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5641 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5642 ; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
5643 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5644 ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11
5645 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
5646 ; AVX2-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
5647 ; AVX2-NEXT: vmovdqa %ymm1, %ymm5
5648 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5649 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255]
5650 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
5651 ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5652 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5653 ; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
5654 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5655 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5656 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5657 ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5658 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5659 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
5660 ; AVX2-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
5661 ; AVX2-NEXT: vmovdqa %ymm1, %ymm5
5662 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5663 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0]
5664 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
5665 ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5666 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5667 ; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
5668 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5669 ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5670 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5671 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm13
5672 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm14
5673 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
5674 ; AVX2-NEXT: vpblendvb %ymm1, %ymm13, %ymm14, %ymm0
5675 ; AVX2-NEXT: vmovdqa %ymm1, %ymm6
5676 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
5677 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u]
5678 ; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
5679 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u]
5680 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
5681 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm1
5682 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11]
5683 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
5684 ; AVX2-NEXT: vpshufb %ymm3, %ymm15, %ymm15
5685 ; AVX2-NEXT: vpmovsxwd {{.*#+}} xmm2 = [4294967295,4294967295,4294967295,255]
5686 ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm15, %ymm0
5687 ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
5688 ; AVX2-NEXT: vpshufb %ymm3, %ymm7, %ymm7
5689 ; AVX2-NEXT: vmovdqa (%rdi), %ymm3
5690 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
5691 ; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm0
5692 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm8
5693 ; AVX2-NEXT: vpshufb %xmm5, %xmm8, %xmm5
5694 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
5695 ; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0
5696 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm7, %ymm15
5697 ; AVX2-NEXT: vpblendvb %ymm12, %ymm13, %ymm14, %ymm0
5698 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [1,6,11,128,128,128,128,4,9,14,128,128,128,u,u,u]
5699 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm5
5700 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
5701 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u]
5702 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
5703 ; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0
5704 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12]
5705 ; AVX2-NEXT: # ymm5 = mem[0,1,0,1]
5706 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5707 ; AVX2-NEXT: vpshufb %ymm5, %ymm8, %ymm8
5708 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm8, %ymm8
5709 ; AVX2-NEXT: vpshufb %ymm5, %ymm11, %ymm0
5710 ; AVX2-NEXT: vpblendvb %ymm12, %ymm3, %ymm1, %ymm5
5711 ; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm6
5712 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5
5713 ; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm4
5714 ; AVX2-NEXT: vpor %xmm6, %xmm4, %xmm4
5715 ; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm11
5716 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
5717 ; AVX2-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0
5718 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
5719 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u]
5720 ; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
5721 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u]
5722 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0
5723 ; AVX2-NEXT: vpor %xmm4, %xmm0, %xmm0
5724 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13]
5725 ; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
5726 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
5727 ; AVX2-NEXT: vpshufb %ymm4, %ymm12, %ymm12
5728 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0
5729 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5730 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5731 ; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
5732 ; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm4
5733 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm12
5734 ; AVX2-NEXT: vpshufb %xmm5, %xmm12, %xmm5
5735 ; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4
5736 ; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4
5737 ; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
5738 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5739 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
5740 ; AVX2-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0
5741 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u]
5742 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm5
5743 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
5744 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u]
5745 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0
5746 ; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0
5747 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14]
5748 ; AVX2-NEXT: # ymm5 = mem[0,1,0,1]
5749 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
5750 ; AVX2-NEXT: vpshufb %ymm5, %ymm12, %ymm12
5751 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0
5752 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5753 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5754 ; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0
5755 ; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm5
5756 ; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm4
5757 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5
5758 ; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm5
5759 ; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4
5760 ; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm6
5761 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
5762 ; AVX2-NEXT: vpblendvb %ymm4, %ymm10, %ymm9, %ymm0
5763 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5764 ; AVX2-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
5765 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
5766 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255]
5767 ; AVX2-NEXT: # ymm5 = mem[0,1,0,1]
5768 ; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm10
5769 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1]
5770 ; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0
5771 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5772 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11]
5773 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
5774 ; AVX2-NEXT: vpblendvb %ymm0, %ymm14, %ymm13, %ymm13
5775 ; AVX2-NEXT: vmovdqa 304(%rdi), %xmm2
5776 ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm9
5777 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm0
5778 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128]
5779 ; AVX2-NEXT: vmovdqa 288(%rdi), %xmm1
5780 ; AVX2-NEXT: vpshufb %xmm12, %xmm1, %xmm3
5781 ; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0
5782 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5783 ; AVX2-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload
5784 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15]
5785 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
5786 ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
5787 ; AVX2-NEXT: vmovdqa 144(%rdi), %xmm3
5788 ; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm4
5789 ; AVX2-NEXT: vmovdqa 128(%rdi), %xmm5
5790 ; AVX2-NEXT: vpshufb %xmm12, %xmm5, %xmm12
5791 ; AVX2-NEXT: vpor %xmm4, %xmm12, %xmm4
5792 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
5793 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5,6,7],ymm15[8,9,10,11,12],ymm4[13,14,15]
5794 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm4[4,5,6,7]
5795 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5796 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12]
5797 ; AVX2-NEXT: vpshufb %xmm12, %xmm2, %xmm7
5798 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm14 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128]
5799 ; AVX2-NEXT: vpshufb %xmm14, %xmm1, %xmm15
5800 ; AVX2-NEXT: vpor %xmm7, %xmm15, %xmm7
5801 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
5802 ; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15]
5803 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7]
5804 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5805 ; AVX2-NEXT: vpshufb %xmm12, %xmm3, %xmm8
5806 ; AVX2-NEXT: vpshufb %xmm14, %xmm5, %xmm12
5807 ; AVX2-NEXT: vpor %xmm8, %xmm12, %xmm8
5808 ; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
5809 ; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15]
5810 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
5811 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14]
5812 ; AVX2-NEXT: vpshufb %xmm12, %xmm2, %xmm11
5813 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128]
5814 ; AVX2-NEXT: vpshufb %xmm15, %xmm1, %xmm14
5815 ; AVX2-NEXT: vpor %xmm11, %xmm14, %xmm11
5816 ; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
5817 ; AVX2-NEXT: vpmovsxwq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,18446744073709551615,255]
5818 ; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
5819 ; AVX2-NEXT: vpshufb %xmm12, %xmm3, %xmm12
5820 ; AVX2-NEXT: vpshufb %xmm15, %xmm5, %xmm15
5821 ; AVX2-NEXT: vpor %xmm12, %xmm15, %xmm12
5822 ; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
5823 ; AVX2-NEXT: vpblendvb %ymm14, %ymm6, %ymm12, %ymm12
5824 ; AVX2-NEXT: vextracti128 $1, %ymm13, %xmm15
5825 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u]
5826 ; AVX2-NEXT: vpshufb %xmm6, %xmm15, %xmm15
5827 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u]
5828 ; AVX2-NEXT: vpshufb %xmm7, %xmm13, %xmm13
5829 ; AVX2-NEXT: vpor %xmm15, %xmm13, %xmm13
5830 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15]
5831 ; AVX2-NEXT: # ymm15 = mem[0,1,0,1]
5832 ; AVX2-NEXT: vpshufb %ymm15, %ymm10, %ymm10
5833 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3,4,5,6,7]
5834 ; AVX2-NEXT: vmovdqa 288(%rdi), %ymm13
5835 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15]
5836 ; AVX2-NEXT: vpshufb %ymm4, %ymm13, %ymm13
5837 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5]
5838 ; AVX2-NEXT: vpermd %ymm13, %ymm0, %ymm13
5839 ; AVX2-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10
5840 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
5841 ; AVX2-NEXT: vpshufb %ymm15, %ymm13, %ymm13
5842 ; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm15
5843 ; AVX2-NEXT: vpshufb %xmm6, %xmm15, %xmm6
5844 ; AVX2-NEXT: vpshufb %xmm7, %xmm9, %xmm7
5845 ; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6
5846 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7]
5847 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm7
5848 ; AVX2-NEXT: vpshufb %ymm4, %ymm7, %ymm4
5849 ; AVX2-NEXT: vpermd %ymm4, %ymm0, %ymm0
5850 ; AVX2-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0
5851 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13]
5852 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
5853 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,4,9,14,128,128,128,0,0,4,9,14,128,128,128]
5854 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1
5855 ; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1
5856 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5857 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5858 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15]
5859 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5860 ; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm2
5861 ; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm3
5862 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
5863 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
5864 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5865 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15]
5866 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
5867 ; AVX2-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload
5868 ; AVX2-NEXT: vmovaps %ymm3, 32(%rsi)
5869 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5870 ; AVX2-NEXT: vmovaps %ymm3, (%rsi)
5871 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5872 ; AVX2-NEXT: vmovaps %ymm3, 32(%rdx)
5873 ; AVX2-NEXT: vmovdqa %ymm8, (%rdx)
5874 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx)
5875 ; AVX2-NEXT: vmovdqa %ymm2, (%rcx)
5876 ; AVX2-NEXT: vmovdqa %ymm11, 32(%r8)
5877 ; AVX2-NEXT: vmovdqa %ymm12, (%r8)
5878 ; AVX2-NEXT: vmovdqa %ymm10, 32(%r9)
5879 ; AVX2-NEXT: vmovdqa %ymm0, (%r9)
5880 ; AVX2-NEXT: addq $136, %rsp
5881 ; AVX2-NEXT: vzeroupper
5882 ; AVX2-NEXT: retq
5884 ; AVX2-FP-LABEL: load_i8_stride5_vf64:
5885 ; AVX2-FP: # %bb.0:
5886 ; AVX2-FP-NEXT: subq $136, %rsp
5887 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm2
5888 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm4
5889 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm10
5890 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm9
5891 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
5892 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm10, %ymm9, %ymm0
5893 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5894 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255]
5895 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1]
5896 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm15
5897 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0
5898 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5899 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm7
5900 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
5901 ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
5902 ; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm5
5903 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5904 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0]
5905 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1]
5906 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5907 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5908 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
5909 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5910 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11
5911 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
5912 ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
5913 ; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm5
5914 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5915 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255]
5916 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1]
5917 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5918 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5919 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
5920 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5921 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5922 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5923 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5924 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5925 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
5926 ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
5927 ; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm5
5928 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5929 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0]
5930 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1]
5931 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5932 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5933 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
5934 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
5935 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5936 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5937 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm13
5938 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm14
5939 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
5940 ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm13, %ymm14, %ymm0
5941 ; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm6
5942 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
5943 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u]
5944 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
5945 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u]
5946 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
5947 ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm1
5948 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11]
5949 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1]
5950 ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm15, %ymm15
5951 ; AVX2-FP-NEXT: vpmovsxwd {{.*#+}} xmm2 = [4294967295,4294967295,4294967295,255]
5952 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm15, %ymm0
5953 ; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
5954 ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm7, %ymm7
5955 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3
5956 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1
5957 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm0
5958 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm8
5959 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm8, %xmm5
5960 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
5961 ; AVX2-FP-NEXT: vpor %xmm5, %xmm0, %xmm0
5962 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm7, %ymm15
5963 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm13, %ymm14, %ymm0
5964 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,6,11,128,128,128,128,4,9,14,128,128,128,u,u,u]
5965 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm5
5966 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0
5967 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u]
5968 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
5969 ; AVX2-FP-NEXT: vpor %xmm5, %xmm0, %xmm0
5970 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12]
5971 ; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1]
5972 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5973 ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm8, %ymm8
5974 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm8, %ymm8
5975 ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm11, %ymm0
5976 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm3, %ymm1, %ymm5
5977 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm6
5978 ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5
5979 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm4
5980 ; AVX2-FP-NEXT: vpor %xmm6, %xmm4, %xmm4
5981 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm11
5982 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
5983 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0
5984 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm4
5985 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u]
5986 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
5987 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u]
5988 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
5989 ; AVX2-FP-NEXT: vpor %xmm4, %xmm0, %xmm0
5990 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13]
5991 ; AVX2-FP-NEXT: # ymm4 = mem[0,1,0,1]
5992 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
5993 ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm12, %ymm12
5994 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0
5995 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5996 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5997 ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
5998 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm4
5999 ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm12
6000 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm12, %xmm5
6001 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm4, %xmm4
6002 ; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4
6003 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
6004 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6005 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
6006 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0
6007 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u]
6008 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm5
6009 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0
6010 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u]
6011 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
6012 ; AVX2-FP-NEXT: vpor %xmm5, %xmm0, %xmm0
6013 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14]
6014 ; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1]
6015 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
6016 ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm12, %ymm12
6017 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0
6018 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6019 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6020 ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
6021 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm5
6022 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm4
6023 ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5
6024 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm5
6025 ; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4
6026 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm6
6027 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
6028 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm10, %ymm9, %ymm0
6029 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6030 ; AVX2-FP-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
6031 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
6032 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255]
6033 ; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1]
6034 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm10
6035 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1]
6036 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0
6037 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6038 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11]
6039 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
6040 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm14, %ymm13, %ymm13
6041 ; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm2
6042 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm9
6043 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm0
6044 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128]
6045 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm1
6046 ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm1, %xmm3
6047 ; AVX2-FP-NEXT: vpor %xmm0, %xmm3, %xmm0
6048 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6049 ; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload
6050 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15]
6051 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
6052 ; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
6053 ; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm3
6054 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm4
6055 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm5
6056 ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm5, %xmm12
6057 ; AVX2-FP-NEXT: vpor %xmm4, %xmm12, %xmm4
6058 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
6059 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5,6,7],ymm15[8,9,10,11,12],ymm4[13,14,15]
6060 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm4[4,5,6,7]
6061 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6062 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12]
6063 ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm7
6064 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm14 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128]
6065 ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm1, %xmm15
6066 ; AVX2-FP-NEXT: vpor %xmm7, %xmm15, %xmm7
6067 ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
6068 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15]
6069 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7]
6070 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6071 ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm3, %xmm8
6072 ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm5, %xmm12
6073 ; AVX2-FP-NEXT: vpor %xmm8, %xmm12, %xmm8
6074 ; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
6075 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15]
6076 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
6077 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14]
6078 ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm11
6079 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128]
6080 ; AVX2-FP-NEXT: vpshufb %xmm15, %xmm1, %xmm14
6081 ; AVX2-FP-NEXT: vpor %xmm11, %xmm14, %xmm11
6082 ; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
6083 ; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,18446744073709551615,255]
6084 ; AVX2-FP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
6085 ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm3, %xmm12
6086 ; AVX2-FP-NEXT: vpshufb %xmm15, %xmm5, %xmm15
6087 ; AVX2-FP-NEXT: vpor %xmm12, %xmm15, %xmm12
6088 ; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
6089 ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm6, %ymm12, %ymm12
6090 ; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm15
6091 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u]
6092 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm15, %xmm15
6093 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u]
6094 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm13, %xmm13
6095 ; AVX2-FP-NEXT: vpor %xmm15, %xmm13, %xmm13
6096 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15]
6097 ; AVX2-FP-NEXT: # ymm15 = mem[0,1,0,1]
6098 ; AVX2-FP-NEXT: vpshufb %ymm15, %ymm10, %ymm10
6099 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3,4,5,6,7]
6100 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm13
6101 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15]
6102 ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm13, %ymm13
6103 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5]
6104 ; AVX2-FP-NEXT: vpermd %ymm13, %ymm0, %ymm13
6105 ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10
6106 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
6107 ; AVX2-FP-NEXT: vpshufb %ymm15, %ymm13, %ymm13
6108 ; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm15
6109 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm15, %xmm6
6110 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm9, %xmm7
6111 ; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6
6112 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7]
6113 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm7
6114 ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm7, %ymm4
6115 ; AVX2-FP-NEXT: vpermd %ymm4, %ymm0, %ymm0
6116 ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0
6117 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13]
6118 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
6119 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,4,9,14,128,128,128,0,0,4,9,14,128,128,128]
6120 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
6121 ; AVX2-FP-NEXT: vpor %xmm2, %xmm1, %xmm1
6122 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6123 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6124 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15]
6125 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6126 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm2
6127 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm3
6128 ; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2
6129 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6130 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6131 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15]
6132 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
6133 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload
6134 ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rsi)
6135 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6136 ; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi)
6137 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6138 ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rdx)
6139 ; AVX2-FP-NEXT: vmovdqa %ymm8, (%rdx)
6140 ; AVX2-FP-NEXT: vmovdqa %ymm1, 32(%rcx)
6141 ; AVX2-FP-NEXT: vmovdqa %ymm2, (%rcx)
6142 ; AVX2-FP-NEXT: vmovdqa %ymm11, 32(%r8)
6143 ; AVX2-FP-NEXT: vmovdqa %ymm12, (%r8)
6144 ; AVX2-FP-NEXT: vmovdqa %ymm10, 32(%r9)
6145 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%r9)
6146 ; AVX2-FP-NEXT: addq $136, %rsp
6147 ; AVX2-FP-NEXT: vzeroupper
6148 ; AVX2-FP-NEXT: retq
6150 ; AVX2-FCP-LABEL: load_i8_stride5_vf64:
6151 ; AVX2-FCP: # %bb.0:
6152 ; AVX2-FCP-NEXT: subq $136, %rsp
6153 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm2
6154 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4
6155 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm10
6156 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm9
6157 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
6158 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm10, %ymm9, %ymm0
6159 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6160 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255]
6161 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
6162 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm15
6163 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0
6164 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6165 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm7
6166 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
6167 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
6168 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm5
6169 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6170 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0]
6171 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
6172 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
6173 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6174 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
6175 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6176 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11
6177 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
6178 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
6179 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm5
6180 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6181 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255]
6182 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
6183 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
6184 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6185 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
6186 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6187 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6188 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6189 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
6190 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6191 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
6192 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
6193 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm5
6194 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6195 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0]
6196 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
6197 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
6198 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6199 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
6200 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
6201 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
6202 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6203 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm13
6204 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm14
6205 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
6206 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm13, %ymm14, %ymm0
6207 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm6
6208 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
6209 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u]
6210 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
6211 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u]
6212 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
6213 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm1
6214 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11]
6215 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
6216 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm15, %ymm15
6217 ; AVX2-FCP-NEXT: vpmovsxwd {{.*#+}} xmm2 = [4294967295,4294967295,4294967295,255]
6218 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm15, %ymm0
6219 ; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
6220 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm7
6221 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3
6222 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
6223 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm0
6224 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm8
6225 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5
6226 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
6227 ; AVX2-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0
6228 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm7, %ymm15
6229 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm13, %ymm14, %ymm0
6230 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,6,11,128,128,128,128,4,9,14,128,128,128,u,u,u]
6231 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm5
6232 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
6233 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u]
6234 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
6235 ; AVX2-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0
6236 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12]
6237 ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1]
6238 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
6239 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm8
6240 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm8, %ymm8
6241 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm0
6242 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm3, %ymm1, %ymm5
6243 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm6
6244 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
6245 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm4
6246 ; AVX2-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4
6247 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm11
6248 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
6249 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0
6250 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
6251 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u]
6252 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
6253 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u]
6254 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
6255 ; AVX2-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0
6256 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13]
6257 ; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1]
6258 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
6259 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm12
6260 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0
6261 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6262 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6263 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
6264 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm4
6265 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm12
6266 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm12, %xmm5
6267 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4
6268 ; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
6269 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
6270 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6271 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
6272 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0
6273 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u]
6274 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm5
6275 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
6276 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u]
6277 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
6278 ; AVX2-FCP-NEXT: vpor %xmm5, %xmm0, %xmm0
6279 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14]
6280 ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1]
6281 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
6282 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm12
6283 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0
6284 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6285 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6286 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
6287 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm5
6288 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm4
6289 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
6290 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5
6291 ; AVX2-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
6292 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm6
6293 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
6294 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm10, %ymm9, %ymm0
6295 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6296 ; AVX2-FCP-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
6297 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
6298 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255]
6299 ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1]
6300 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm10
6301 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1]
6302 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0
6303 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6304 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11]
6305 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
6306 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm13, %ymm13
6307 ; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm2
6308 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm9
6309 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm0
6310 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128]
6311 ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm1
6312 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm3
6313 ; AVX2-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0
6314 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6315 ; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload
6316 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15]
6317 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
6318 ; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
6319 ; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm3
6320 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4
6321 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm5
6322 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm12
6323 ; AVX2-FCP-NEXT: vpor %xmm4, %xmm12, %xmm4
6324 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
6325 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5,6,7],ymm15[8,9,10,11,12],ymm4[13,14,15]
6326 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm4[4,5,6,7]
6327 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6328 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12]
6329 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm7
6330 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm14 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128]
6331 ; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm15
6332 ; AVX2-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7
6333 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
6334 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15]
6335 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7]
6336 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6337 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm8
6338 ; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm12
6339 ; AVX2-FCP-NEXT: vpor %xmm8, %xmm12, %xmm8
6340 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
6341 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15]
6342 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
6343 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14]
6344 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm11
6345 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128]
6346 ; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm14
6347 ; AVX2-FCP-NEXT: vpor %xmm11, %xmm14, %xmm11
6348 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
6349 ; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,18446744073709551615,255]
6350 ; AVX2-FCP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
6351 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm12
6352 ; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm15
6353 ; AVX2-FCP-NEXT: vpor %xmm12, %xmm15, %xmm12
6354 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
6355 ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm6, %ymm12, %ymm12
6356 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15
6357 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u]
6358 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm15, %xmm15
6359 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u]
6360 ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm13
6361 ; AVX2-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13
6362 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15]
6363 ; AVX2-FCP-NEXT: # ymm15 = mem[0,1,0,1]
6364 ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm10, %ymm10
6365 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3,4,5,6,7]
6366 ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm13
6367 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15]
6368 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm13, %ymm13
6369 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5]
6370 ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm0, %ymm13
6371 ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10
6372 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
6373 ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm13, %ymm13
6374 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm15
6375 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm15, %xmm6
6376 ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7
6377 ; AVX2-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
6378 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7]
6379 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm7
6380 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm4
6381 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm0
6382 ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0
6383 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13]
6384 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
6385 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,4,9,14,128,128,128,0,0,4,9,14,128,128,128]
6386 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
6387 ; AVX2-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
6388 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6389 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6390 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15]
6391 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6392 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm2
6393 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm3
6394 ; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
6395 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6396 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6397 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15]
6398 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
6399 ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload
6400 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rsi)
6401 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6402 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi)
6403 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6404 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rdx)
6405 ; AVX2-FCP-NEXT: vmovdqa %ymm8, (%rdx)
6406 ; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%rcx)
6407 ; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rcx)
6408 ; AVX2-FCP-NEXT: vmovdqa %ymm11, 32(%r8)
6409 ; AVX2-FCP-NEXT: vmovdqa %ymm12, (%r8)
6410 ; AVX2-FCP-NEXT: vmovdqa %ymm10, 32(%r9)
6411 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%r9)
6412 ; AVX2-FCP-NEXT: addq $136, %rsp
6413 ; AVX2-FCP-NEXT: vzeroupper
6414 ; AVX2-FCP-NEXT: retq
6415 ;
6416 ; AVX512-LABEL: load_i8_stride5_vf64:
6417 ; AVX512: # %bb.0:
6418 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
6419 ; AVX512-NEXT: vmovdqa64 (%rdi), %ymm23
6420 ; AVX512-NEXT: vmovdqa64 32(%rdi), %ymm24
6421 ; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm21
6422 ; AVX512-NEXT: vmovdqa64 96(%rdi), %ymm22
6423 ; AVX512-NEXT: vmovdqa %ymm5, %ymm4
6424 ; AVX512-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm4
6425 ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
6426 ; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm6
6427 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
6428 ; AVX512-NEXT: vpshufb %ymm10, %ymm6, %ymm6
6429 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
6430 ; AVX512-NEXT: vmovdqa %ymm4, %ymm7
6431 ; AVX512-NEXT: vpternlogq $202, %ymm24, %ymm23, %ymm7
6432 ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8
6433 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u]
6434 ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u]
6435 ; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm12
6436 ; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
6437 ; AVX512-NEXT: vpternlogq $236, %ymm19, %ymm6, %ymm12
6438 ; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm25
6439 ; AVX512-NEXT: vmovdqa 224(%rdi), %ymm7
6440 ; AVX512-NEXT: vmovdqa %ymm4, %ymm9
6441 ; AVX512-NEXT: vpternlogq $202, %ymm25, %ymm7, %ymm9
6442 ; AVX512-NEXT: vmovdqa 208(%rdi), %xmm8
6443 ; AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm9
6444 ; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,ymm9[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
6445 ; AVX512-NEXT: vmovdqa 176(%rdi), %xmm9
6446 ; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
6447 ; AVX512-NEXT: vmovdqa 160(%rdi), %xmm11
6448 ; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
6449 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
6450 ; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
6451 ; AVX512-NEXT: vpternlogq $186, %ymm13, %ymm16, %ymm0
6452 ; AVX512-NEXT: vmovdqa 144(%rdi), %xmm13
6453 ; AVX512-NEXT: vpshufb %xmm10, %xmm13, %xmm10
6454 ; AVX512-NEXT: vmovdqa 128(%rdi), %xmm14
6455 ; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
6456 ; AVX512-NEXT: vpor %xmm10, %xmm15, %xmm10
6457 ; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
6458 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm10
6459 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
6460 ; AVX512-NEXT: vpternlogq $184, %zmm12, %zmm20, %zmm10
6461 ; AVX512-NEXT: vmovdqa 256(%rdi), %ymm15
6462 ; AVX512-NEXT: vmovdqa 288(%rdi), %ymm12
6463 ; AVX512-NEXT: vmovdqa %ymm5, %ymm2
6464 ; AVX512-NEXT: vpternlogq $202, %ymm15, %ymm12, %ymm2
6465 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
6466 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11]
6467 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero
6468 ; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
6469 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6470 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
6471 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm18
6472 ; AVX512-NEXT: vmovdqa %ymm4, %ymm0
6473 ; AVX512-NEXT: vpternlogq $202, %ymm12, %ymm15, %ymm0
6474 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
6475 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12]
6476 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero
6477 ; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0
6478 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6479 ; AVX512-NEXT: vmovdqa %ymm5, %ymm2
6480 ; AVX512-NEXT: vpternlogq $202, %ymm25, %ymm7, %ymm2
6481 ; AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm2
6482 ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
6483 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[1,6,11],zero,zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u]
6484 ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u]
6485 ; AVX512-NEXT: vpor %xmm3, %xmm10, %xmm3
6486 ; AVX512-NEXT: vpternlogq $186, %ymm2, %ymm16, %ymm3
6487 ; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
6488 ; AVX512-NEXT: vpternlogq $226, %ymm0, %ymm16, %ymm3
6489 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
6490 ; AVX512-NEXT: vmovdqa %ymm10, %ymm0
6491 ; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm0
6492 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6493 ; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
6494 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128]
6495 ; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2
6496 ; AVX512-NEXT: vmovdqa %ymm5, %ymm1
6497 ; AVX512-NEXT: vpternlogq $202, %ymm24, %ymm23, %ymm1
6498 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero,xmm1[u,u,u]
6499 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
6500 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u]
6501 ; AVX512-NEXT: vpor %xmm6, %xmm1, %xmm1
6502 ; AVX512-NEXT: vpternlogq $236, %ymm19, %ymm2, %ymm1
6503 ; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm0
6504 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
6505 ; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
6506 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6507 ; AVX512-NEXT: vpternlogq $184, %zmm1, %zmm20, %zmm0
6508 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm17
6509 ; AVX512-NEXT: vmovdqa %ymm5, %ymm0
6510 ; AVX512-NEXT: vpternlogq $202, %ymm12, %ymm15, %ymm0
6511 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero
6512 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
6513 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13]
6514 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
6515 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6516 ; AVX512-NEXT: vmovdqa %ymm4, %ymm1
6517 ; AVX512-NEXT: vpternlogq $202, %ymm7, %ymm25, %ymm1
6518 ; AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm1
6519 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
6520 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u]
6521 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
6522 ; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2
6523 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
6524 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6525 ; AVX512-NEXT: vpternlogq $226, %ymm0, %ymm16, %ymm1
6526 ; AVX512-NEXT: vmovdqa %ymm4, %ymm0
6527 ; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm0
6528 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6529 ; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
6530 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128]
6531 ; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2
6532 ; AVX512-NEXT: vmovdqa %ymm10, %ymm3
6533 ; AVX512-NEXT: vpternlogq $202, %ymm23, %ymm24, %ymm3
6534 ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm6
6535 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14,u,u,u]
6536 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u]
6537 ; AVX512-NEXT: vpor %xmm6, %xmm3, %xmm3
6538 ; AVX512-NEXT: vpternlogq $236, %ymm19, %ymm2, %ymm3
6539 ; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm0
6540 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
6541 ; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
6542 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6543 ; AVX512-NEXT: vpternlogq $184, %zmm3, %zmm20, %zmm0
6544 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19
6545 ; AVX512-NEXT: vmovdqa %ymm10, %ymm0
6546 ; AVX512-NEXT: vpternlogq $202, %ymm15, %ymm12, %ymm0
6547 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
6548 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14]
6549 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero
6550 ; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
6551 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6552 ; AVX512-NEXT: vmovdqa %ymm5, %ymm1
6553 ; AVX512-NEXT: vpternlogq $202, %ymm7, %ymm25, %ymm1
6554 ; AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm1
6555 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
6556 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u]
6557 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[3,8,13],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
6558 ; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2
6559 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
6560 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6561 ; AVX512-NEXT: vpternlogq $226, %ymm0, %ymm16, %ymm1
6562 ; AVX512-NEXT: vmovdqa %ymm5, %ymm0
6563 ; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm0
6564 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6565 ; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
6566 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
6567 ; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2
6568 ; AVX512-NEXT: vmovdqa %ymm4, %ymm3
6569 ; AVX512-NEXT: vpternlogq $202, %ymm23, %ymm24, %ymm3
6570 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u]
6571 ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3
6572 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u]
6573 ; AVX512-NEXT: vpor %xmm6, %xmm3, %xmm3
6574 ; AVX512-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
6575 ; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm0
6576 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
6577 ; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
6578 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6579 ; AVX512-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
6580 ; AVX512-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm0
6581 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
6582 ; AVX512-NEXT: vpternlogq $226, %ymm15, %ymm4, %ymm12
6583 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero
6584 ; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm3
6585 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
6586 ; AVX512-NEXT: vpor %xmm1, %xmm3, %xmm1
6587 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6588 ; AVX512-NEXT: vpternlogq $202, %ymm25, %ymm7, %ymm10
6589 ; AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm10
6590 ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
6591 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u]
6592 ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4,9,14],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
6593 ; AVX512-NEXT: vpor %xmm6, %xmm7, %xmm6
6594 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7]
6595 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
6596 ; AVX512-NEXT: vpternlogq $226, %ymm1, %ymm16, %ymm3
6597 ; AVX512-NEXT: vpternlogq $202, %ymm23, %ymm24, %ymm5
6598 ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm1
6599 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u]
6600 ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u]
6601 ; AVX512-NEXT: vpor %xmm1, %xmm5, %xmm1
6602 ; AVX512-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm4
6603 ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
6604 ; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5
6605 ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
6606 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7]
6607 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm4
6608 ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
6609 ; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,5,0,5,0,5,0,5]
6610 ; AVX512-NEXT: vpermd %ymm4, %ymm5, %ymm4
6611 ; AVX512-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm4
6612 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1
6613 ; AVX512-NEXT: vmovdqa64 %zmm18, (%rsi)
6614 ; AVX512-NEXT: vmovdqa64 %zmm17, (%rdx)
6615 ; AVX512-NEXT: vmovdqa64 %zmm19, (%rcx)
6616 ; AVX512-NEXT: vmovdqa64 %zmm0, (%r8)
6617 ; AVX512-NEXT: vmovdqa64 %zmm1, (%r9)
6618 ; AVX512-NEXT: vzeroupper
6619 ; AVX512-NEXT: retq
6620 ;
6621 ; AVX512-FCP-LABEL: load_i8_stride5_vf64:
6622 ; AVX512-FCP: # %bb.0:
6623 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
6624 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm23
6625 ; AVX512-FCP-NEXT: vmovdqa64 32(%rdi), %ymm24
6626 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm21
6627 ; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm22
6628 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm4
6629 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm4
6630 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
6631 ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm6
6632 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
6633 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6
6634 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
6635 ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm7
6636 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm24, %ymm23, %ymm7
6637 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
6638 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u]
6639 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u]
6640 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm12
6641 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
6642 ; AVX512-FCP-NEXT: vpternlogq $236, %ymm19, %ymm6, %ymm12
6643 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25
6644 ; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm7
6645 ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm9
6646 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm25, %ymm7, %ymm9
6647 ; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm8
6648 ; AVX512-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm9
6649 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,ymm9[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
6650 ; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm9
6651 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
6652 ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm11
6653 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
6654 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
6655 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
6656 ; AVX512-FCP-NEXT: vpternlogq $186, %ymm13, %ymm16, %ymm0
6657 ; AVX512-FCP-NEXT: vmovdqa 144(%rdi), %xmm13
6658 ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm10
6659 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm14
6660 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
6661 ; AVX512-FCP-NEXT: vpor %xmm10, %xmm15, %xmm10
6662 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
6663 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm10
6664 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
6665 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm12, %zmm20, %zmm10
6666 ; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm15
6667 ; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm12
6668 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm2
6669 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm15, %ymm12, %ymm2
6670 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
6671 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11]
6672 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero
6673 ; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
6674 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6675 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
6676 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm18
6677 ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm0
6678 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm12, %ymm15, %ymm0
6679 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
6680 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12]
6681 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero
6682 ; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0
6683 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6684 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm2
6685 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm25, %ymm7, %ymm2
6686 ; AVX512-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm2
6687 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
6688 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[1,6,11],zero,zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u]
6689 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u]
6690 ; AVX512-FCP-NEXT: vpor %xmm3, %xmm10, %xmm3
6691 ; AVX512-FCP-NEXT: vpternlogq $186, %ymm2, %ymm16, %ymm3
6692 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
6693 ; AVX512-FCP-NEXT: vpternlogq $226, %ymm0, %ymm16, %ymm3
6694 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
6695 ; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0
6696 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm0
6697 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6698 ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
6699 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128]
6700 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2
6701 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1
6702 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm24, %ymm23, %ymm1
6703 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero,xmm1[u,u,u]
6704 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
6705 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u]
6706 ; AVX512-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1
6707 ; AVX512-FCP-NEXT: vpternlogq $236, %ymm19, %ymm2, %ymm1
6708 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0
6709 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
6710 ; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
6711 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6712 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm20, %zmm0
6713 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm17
6714 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0
6715 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm12, %ymm15, %ymm0
6716 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero
6717 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
6718 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13]
6719 ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
6720 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6721 ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm1
6722 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm7, %ymm25, %ymm1
6723 ; AVX512-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm1
6724 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
6725 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u]
6726 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
6727 ; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
6728 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
6729 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6730 ; AVX512-FCP-NEXT: vpternlogq $226, %ymm0, %ymm16, %ymm1
6731 ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm0
6732 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm0
6733 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6734 ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
6735 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128]
6736 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2
6737 ; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm3
6738 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm23, %ymm24, %ymm3
6739 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6
6740 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14,u,u,u]
6741 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u]
6742 ; AVX512-FCP-NEXT: vpor %xmm6, %xmm3, %xmm3
6743 ; AVX512-FCP-NEXT: vpternlogq $236, %ymm19, %ymm2, %ymm3
6744 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0
6745 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
6746 ; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
6747 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6748 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm3, %zmm20, %zmm0
6749 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19
6750 ; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0
6751 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm15, %ymm12, %ymm0
6752 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
6753 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14]
6754 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero
6755 ; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
6756 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6757 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1
6758 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm7, %ymm25, %ymm1
6759 ; AVX512-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm1
6760 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
6761 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u]
6762 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[3,8,13],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
6763 ; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
6764 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
6765 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6766 ; AVX512-FCP-NEXT: vpternlogq $226, %ymm0, %ymm16, %ymm1
6767 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0
6768 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm0
6769 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6770 ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
6771 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
6772 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2
6773 ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm3
6774 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm23, %ymm24, %ymm3
6775 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u]
6776 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
6777 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u]
6778 ; AVX512-FCP-NEXT: vpor %xmm6, %xmm3, %xmm3
6779 ; AVX512-FCP-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
6780 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0
6781 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
6782 ; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
6783 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6784 ; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
6785 ; AVX512-FCP-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm0
6786 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
6787 ; AVX512-FCP-NEXT: vpternlogq $226, %ymm15, %ymm4, %ymm12
6788 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero
6789 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm3
6790 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
6791 ; AVX512-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1
6792 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6793 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm25, %ymm7, %ymm10
6794 ; AVX512-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm10
6795 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
6796 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u]
6797 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4,9,14],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
6798 ; AVX512-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
6799 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7]
6800 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
6801 ; AVX512-FCP-NEXT: vpternlogq $226, %ymm1, %ymm16, %ymm3
6802 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm23, %ymm24, %ymm5
6803 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1
6804 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u]
6805 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u]
6806 ; AVX512-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1
6807 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm4
6808 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
6809 ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5
6810 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
6811 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7]
6812 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm4
6813 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
6814 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,5,0,5,0,5,0,5]
6815 ; AVX512-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4
6816 ; AVX512-FCP-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm4
6817 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1
6818 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rsi)
6819 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rdx)
6820 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
6821 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
6822 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9)
6823 ; AVX512-FCP-NEXT: vzeroupper
6824 ; AVX512-FCP-NEXT: retq
6825 ;
6826 ; AVX512DQ-LABEL: load_i8_stride5_vf64:
6827 ; AVX512DQ: # %bb.0:
6828 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
6829 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm23
6830 ; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm24
6831 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm21
6832 ; AVX512DQ-NEXT: vmovdqa64 96(%rdi), %ymm22
6833 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm4
6834 ; AVX512DQ-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm4
6835 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
6836 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm6
6837 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
6838 ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm6, %ymm6
6839 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
6840 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm7
6841 ; AVX512DQ-NEXT: vpternlogq $202, %ymm24, %ymm23, %ymm7
6842 ; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8
6843 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u]
6844 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u]
6845 ; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm12
6846 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
6847 ; AVX512DQ-NEXT: vpternlogq $236, %ymm19, %ymm6, %ymm12
6848 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm25
6849 ; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm7
6850 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm9
6851 ; AVX512DQ-NEXT: vpternlogq $202, %ymm25, %ymm7, %ymm9
6852 ; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm8
6853 ; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm9
6854 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,ymm9[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
6855 ; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm9
6856 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
6857 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm11
6858 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
6859 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
6860 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
6861 ; AVX512DQ-NEXT: vpternlogq $186, %ymm13, %ymm16, %ymm0
6862 ; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm13
6863 ; AVX512DQ-NEXT: vpshufb %xmm10, %xmm13, %xmm10
6864 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm14
6865 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
6866 ; AVX512DQ-NEXT: vpor %xmm10, %xmm15, %xmm10
6867 ; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
6868 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm10
6869 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
6870 ; AVX512DQ-NEXT: vpternlogq $184, %zmm12, %zmm20, %zmm10
6871 ; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm15
6872 ; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm12
6873 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2
6874 ; AVX512DQ-NEXT: vpternlogq $202, %ymm15, %ymm12, %ymm2
6875 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
6876 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11]
6877 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero
6878 ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
6879 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6880 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
6881 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm18
6882 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm0
6883 ; AVX512DQ-NEXT: vpternlogq $202, %ymm12, %ymm15, %ymm0
6884 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2
6885 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12]
6886 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero
6887 ; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0
6888 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6889 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2
6890 ; AVX512DQ-NEXT: vpternlogq $202, %ymm25, %ymm7, %ymm2
6891 ; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm2
6892 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
6893 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[1,6,11],zero,zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u]
6894 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u]
6895 ; AVX512DQ-NEXT: vpor %xmm3, %xmm10, %xmm3
6896 ; AVX512DQ-NEXT: vpternlogq $186, %ymm2, %ymm16, %ymm3
6897 ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
6898 ; AVX512DQ-NEXT: vpternlogq $226, %ymm0, %ymm16, %ymm3
6899 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
6900 ; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm0
6901 ; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm0
6902 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6903 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
6904 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128]
6905 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2
6906 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1
6907 ; AVX512DQ-NEXT: vpternlogq $202, %ymm24, %ymm23, %ymm1
6908 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero,xmm1[u,u,u]
6909 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1
6910 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u]
6911 ; AVX512DQ-NEXT: vpor %xmm6, %xmm1, %xmm1
6912 ; AVX512DQ-NEXT: vpternlogq $236, %ymm19, %ymm2, %ymm1
6913 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm13, %xmm0
6914 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
6915 ; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0
6916 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6917 ; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm20, %zmm0
6918 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm17
6919 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0
6920 ; AVX512DQ-NEXT: vpternlogq $202, %ymm12, %ymm15, %ymm0
6921 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero
6922 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
6923 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13]
6924 ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0
6925 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6926 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm1
6927 ; AVX512DQ-NEXT: vpternlogq $202, %ymm7, %ymm25, %ymm1
6928 ; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm1
6929 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
6930 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u]
6931 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
6932 ; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2
6933 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
6934 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6935 ; AVX512DQ-NEXT: vpternlogq $226, %ymm0, %ymm16, %ymm1
6936 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm0
6937 ; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm0
6938 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6939 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
6940 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128]
6941 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2
6942 ; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm3
6943 ; AVX512DQ-NEXT: vpternlogq $202, %ymm23, %ymm24, %ymm3
6944 ; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm6
6945 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14,u,u,u]
6946 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u]
6947 ; AVX512DQ-NEXT: vpor %xmm6, %xmm3, %xmm3
6948 ; AVX512DQ-NEXT: vpternlogq $236, %ymm19, %ymm2, %ymm3
6949 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm13, %xmm0
6950 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
6951 ; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0
6952 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6953 ; AVX512DQ-NEXT: vpternlogq $184, %zmm3, %zmm20, %zmm0
6954 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19
6955 ; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm0
6956 ; AVX512DQ-NEXT: vpternlogq $202, %ymm15, %ymm12, %ymm0
6957 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
6958 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14]
6959 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero
6960 ; AVX512DQ-NEXT: vpor %xmm1, %xmm0, %xmm0
6961 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6962 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1
6963 ; AVX512DQ-NEXT: vpternlogq $202, %ymm7, %ymm25, %ymm1
6964 ; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm1
6965 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
6966 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u]
6967 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[3,8,13],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
6968 ; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2
6969 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
6970 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6971 ; AVX512DQ-NEXT: vpternlogq $226, %ymm0, %ymm16, %ymm1
6972 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0
6973 ; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm0
6974 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6975 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
6976 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
6977 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2
6978 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm3
6979 ; AVX512DQ-NEXT: vpternlogq $202, %ymm23, %ymm24, %ymm3
6980 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u]
6981 ; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm3
6982 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u]
6983 ; AVX512DQ-NEXT: vpor %xmm6, %xmm3, %xmm3
6984 ; AVX512DQ-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
6985 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm13, %xmm0
6986 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
6987 ; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0
6988 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6989 ; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
6990 ; AVX512DQ-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm0
6991 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
6992 ; AVX512DQ-NEXT: vpternlogq $226, %ymm15, %ymm4, %ymm12
6993 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero
6994 ; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm3
6995 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
6996 ; AVX512DQ-NEXT: vpor %xmm1, %xmm3, %xmm1
6997 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6998 ; AVX512DQ-NEXT: vpternlogq $202, %ymm25, %ymm7, %ymm10
6999 ; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm10
7000 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
7001 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u]
7002 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4,9,14],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
7003 ; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6
7004 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7]
7005 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
7006 ; AVX512DQ-NEXT: vpternlogq $226, %ymm1, %ymm16, %ymm3
7007 ; AVX512DQ-NEXT: vpternlogq $202, %ymm23, %ymm24, %ymm5
7008 ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm1
7009 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u]
7010 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u]
7011 ; AVX512DQ-NEXT: vpor %xmm1, %xmm5, %xmm1
7012 ; AVX512DQ-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm4
7013 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
7014 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5
7015 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
7016 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7]
7017 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm4
7018 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
7019 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,5,0,5,0,5,0,5]
7020 ; AVX512DQ-NEXT: vpermd %ymm4, %ymm5, %ymm4
7021 ; AVX512DQ-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm4
7022 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1
7023 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rsi)
7024 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rdx)
7025 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rcx)
7026 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8)
7027 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r9)
7028 ; AVX512DQ-NEXT: vzeroupper
7029 ; AVX512DQ-NEXT: retq
7030 ;
7031 ; AVX512DQ-FCP-LABEL: load_i8_stride5_vf64:
7032 ; AVX512DQ-FCP: # %bb.0:
7033 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
7034 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm23
7035 ; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %ymm24
7036 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm21
7037 ; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm22
7038 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm4
7039 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm4
7040 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
7041 ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm6
7042 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,6,11,16,21,26,31,20,25,30,19,24,29,128,128,128,128,128,128]
7043 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6
7044 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
7045 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm7
7046 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm24, %ymm23, %ymm7
7047 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
7048 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[2,7,12,u,u,u]
7049 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u]
7050 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm12
7051 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
7052 ; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm19, %ymm6, %ymm12
7053 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25
7054 ; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm7
7055 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm9
7056 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm25, %ymm7, %ymm9
7057 ; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm8
7058 ; AVX512DQ-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm9
7059 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,ymm9[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
7060 ; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm9
7061 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
7062 ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm11
7063 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
7064 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
7065 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
7066 ; AVX512DQ-FCP-NEXT: vpternlogq $186, %ymm13, %ymm16, %ymm0
7067 ; AVX512DQ-FCP-NEXT: vmovdqa 144(%rdi), %xmm13
7068 ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm10
7069 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm14
7070 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
7071 ; AVX512DQ-FCP-NEXT: vpor %xmm10, %xmm15, %xmm10
7072 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
7073 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm10
7074 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
7075 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm12, %zmm20, %zmm10
7076 ; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm15
7077 ; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm12
7078 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2
7079 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm15, %ymm12, %ymm2
7080 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
7081 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11]
7082 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero
7083 ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
7084 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
7085 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
7086 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm18
7087 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm0
7088 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm12, %ymm15, %ymm0
7089 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
7090 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12]
7091 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero
7092 ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0
7093 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7094 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2
7095 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm25, %ymm7, %ymm2
7096 ; AVX512DQ-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm2
7097 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
7098 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[1,6,11],zero,zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u]
7099 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u]
7100 ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm10, %xmm3
7101 ; AVX512DQ-FCP-NEXT: vpternlogq $186, %ymm2, %ymm16, %ymm3
7102 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
7103 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm0, %ymm16, %ymm3
7104 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
7105 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm0
7106 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm0
7107 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
7108 ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
7109 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,2,7,12,17,22,27,16,21,26,31,20,25,30,128,128,128,128,128,128]
7110 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2
7111 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1
7112 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm24, %ymm23, %ymm1
7113 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero,xmm1[u,u,u]
7114 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
7115 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u]
7116 ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1
7117 ; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm19, %ymm2, %ymm1
7118 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0
7119 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
7120 ; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
7121 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7122 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm1, %zmm20, %zmm0
7123 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm17
7124 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0
7125 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm12, %ymm15, %ymm0
7126 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero
7127 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
7128 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13]
7129 ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
7130 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7131 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm1
7132 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm7, %ymm25, %ymm1
7133 ; AVX512DQ-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm1
7134 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
7135 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u]
7136 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
7137 ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
7138 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
7139 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
7140 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm0, %ymm16, %ymm1
7141 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm0
7142 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm0
7143 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
7144 ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
7145 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,3,8,13,18,23,28,17,22,27,16,21,26,31,128,128,128,128,128,128]
7146 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2
7147 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm3
7148 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm23, %ymm24, %ymm3
7149 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6
7150 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[1,6,11],zero,zero,zero,zero,xmm6[4,9,14,u,u,u]
7151 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u]
7152 ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm3, %xmm3
7153 ; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm19, %ymm2, %ymm3
7154 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0
7155 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
7156 ; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
7157 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7158 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm3, %zmm20, %zmm0
7159 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19
7160 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm0
7161 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm15, %ymm12, %ymm0
7162 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
7163 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14]
7164 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero
7165 ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
7166 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7167 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1
7168 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm7, %ymm25, %ymm1
7169 ; AVX512DQ-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm1
7170 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
7171 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u]
7172 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[3,8,13],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
7173 ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
7174 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
7175 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
7176 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm0, %ymm16, %ymm1
7177 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0
7178 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm0
7179 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
7180 ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
7181 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
7182 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2
7183 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm3
7184 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm23, %ymm24, %ymm3
7185 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[u,u,u]
7186 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
7187 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u]
7188 ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm3, %xmm3
7189 ; AVX512DQ-FCP-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
7190 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0
7191 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
7192 ; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
7193 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7194 ; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
7195 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm3, %zmm2, %zmm0
7196 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
7197 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm15, %ymm4, %ymm12
7198 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero
7199 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm3
7200 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
7201 ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1
7202 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
7203 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm25, %ymm7, %ymm10
7204 ; AVX512DQ-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm10
7205 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
7206 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u]
7207 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4,9,14],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
7208 ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
7209 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7]
7210 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
7211 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm1, %ymm16, %ymm3
7212 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm23, %ymm24, %ymm5
7213 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1
7214 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u]
7215 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u]
7216 ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm5, %xmm1
7217 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm4
7218 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
7219 ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm5
7220 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
7221 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7]
7222 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm4
7223 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
7224 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,5,0,5,0,5,0,5]
7225 ; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4
7226 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm1, %zmm2, %zmm4
7227 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1
7228 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rsi)
7229 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rdx)
7230 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
7231 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
7232 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r9)
7233 ; AVX512DQ-FCP-NEXT: vzeroupper
7234 ; AVX512DQ-FCP-NEXT: retq
7235 ;
7236 ; AVX512BW-LABEL: load_i8_stride5_vf64:
7237 ; AVX512BW: # %bb.0:
7238 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm3
7239 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
7240 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm0
7241 ; AVX512BW-NEXT: vmovdqa 96(%rdi), %ymm1
7242 ; AVX512BW-NEXT: movw $21140, %ax # imm = 0x5294
7243 ; AVX512BW-NEXT: kmovd %eax, %k2
7244 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k2}
7245 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
7246 ; AVX512BW-NEXT: movl $1108344832, %eax # imm = 0x42100000
7247 ; AVX512BW-NEXT: kmovd %eax, %k1
7248 ; AVX512BW-NEXT: vmovdqu8 %ymm5, %ymm4 {%k1}
7249 ; AVX512BW-NEXT: movw $19026, %ax # imm = 0x4A52
7250 ; AVX512BW-NEXT: kmovd %eax, %k1
7251 ; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k1}
7252 ; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6
7253 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
7254 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
7255 ; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm10
7256 ; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000
7257 ; AVX512BW-NEXT: kmovd %eax, %k5
7258 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
7259 ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm5
7260 ; AVX512BW-NEXT: vmovdqa 224(%rdi), %ymm4
7261 ; AVX512BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1}
7262 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
7263 ; AVX512BW-NEXT: movl $4228, %eax # imm = 0x1084
7264 ; AVX512BW-NEXT: kmovd %eax, %k3
7265 ; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k3}
7266 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
7267 ; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm6
7268 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
7269 ; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm7
7270 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
7271 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
7272 ; AVX512BW-NEXT: movl $127, %eax
7273 ; AVX512BW-NEXT: kmovd %eax, %k4
7274 ; AVX512BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4}
7275 ; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm12
7276 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[1,6,11]
7277 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm13
7278 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
7279 ; AVX512BW-NEXT: vpor %xmm9, %xmm11, %xmm9
7280 ; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
7281 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8
7282 ; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm10 {%k5}
7283 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm10, %ymm11
7284 ; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm9
7285 ; AVX512BW-NEXT: vmovdqa 288(%rdi), %ymm8
7286 ; AVX512BW-NEXT: vpblendmw %ymm9, %ymm8, %ymm14 {%k2}
7287 ; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm15
7288 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11]
7289 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,9,14],zero,zero,zero,xmm14[2,7,12],zero,zero,zero
7290 ; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14
7291 ; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
7292 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
7293 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm18
7294 ; AVX512BW-NEXT: movw $10570, %ax # imm = 0x294A
7295 ; AVX512BW-NEXT: kmovd %eax, %k3
7296 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm11 {%k3}
7297 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1]
7298 ; AVX512BW-NEXT: movl $-2078212096, %eax # imm = 0x84210000
7299 ; AVX512BW-NEXT: kmovd %eax, %k6
7300 ; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm11 {%k6}
7301 ; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k2}
7302 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero,xmm14[u,u,u]
7303 ; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm14
7304 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13,u,u,u]
7305 ; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14
7306 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
7307 ; AVX512BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm11 {%k2}
7308 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,3,0,1]
7309 ; AVX512BW-NEXT: movl $8456, %eax # imm = 0x2108
7310 ; AVX512BW-NEXT: kmovd %eax, %k6
7311 ; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k6}
7312 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
7313 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u]
7314 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u]
7315 ; AVX512BW-NEXT: vporq %xmm15, %xmm16, %xmm15
7316 ; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
7317 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[2,7,12]
7318 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
7319 ; AVX512BW-NEXT: vporq %xmm15, %xmm16, %xmm15
7320 ; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
7321 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm11
7322 ; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm14 {%k5}
7323 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm14, %ymm11
7324 ; AVX512BW-NEXT: vpblendmw %ymm8, %ymm9, %ymm15 {%k1}
7325 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16
7326 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12]
7327 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero
7328 ; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15
7329 ; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
7330 ; AVX512BW-NEXT: movl $-524288, %eax # imm = 0xFFF80000
7331 ; AVX512BW-NEXT: kmovd %eax, %k4
7332 ; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
7333 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm19
7334 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm14 {%k1}
7335 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
7336 ; AVX512BW-NEXT: movl $138543104, %eax # imm = 0x8420000
7337 ; AVX512BW-NEXT: kmovd %eax, %k6
7338 ; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm14 {%k6}
7339 ; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k3}
7340 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16
7341 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[4,9,14,u,u,u]
7342 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[u,u,u]
7343 ; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15
7344 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
7345 ; AVX512BW-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1}
7346 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm14[2,3,0,1]
7347 ; AVX512BW-NEXT: movl $16912, %eax # imm = 0x4210
7348 ; AVX512BW-NEXT: kmovd %eax, %k6
7349 ; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm14 {%k6}
7350 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
7351 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
7352 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
7353 ; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm10
7354 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4,5,6,7]
7355 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7]
7356 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[3,8,13]
7357 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
7358 ; AVX512BW-NEXT: vporq %xmm14, %xmm16, %xmm14
7359 ; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
7360 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm10
7361 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm15 {%k5}
7362 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm15, %ymm10
7363 ; AVX512BW-NEXT: vpblendmw %ymm8, %ymm9, %ymm14 {%k2}
7364 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero
7365 ; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm14
7366 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13]
7367 ; AVX512BW-NEXT: vporq %xmm16, %xmm14, %xmm14
7368 ; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
7369 ; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm10 {%k4}
7370 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm14
7371 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k2}
7372 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1]
7373 ; AVX512BW-NEXT: movl $277086208, %eax # imm = 0x10840000
7374 ; AVX512BW-NEXT: kmovd %eax, %k5
7375 ; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm10 {%k5}
7376 ; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1}
7377 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u]
7378 ; AVX512BW-NEXT: vextracti128 $1, %ymm15, %xmm15
7379 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15,u,u,u]
7380 ; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15
7381 ; AVX512BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000
7382 ; AVX512BW-NEXT: kmovd %eax, %k5
7383 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
7384 ; AVX512BW-NEXT: vpblendmw %ymm4, %ymm5, %ymm10 {%k2}
7385 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm10[2,3,0,1]
7386 ; AVX512BW-NEXT: movl $33825, %eax # imm = 0x8421
7387 ; AVX512BW-NEXT: kmovd %eax, %k5
7388 ; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm10 {%k5}
7389 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
7390 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
7391 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
7392 ; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm11
7393 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
7394 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
7395 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm12[4,9,14]
7396 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
7397 ; AVX512BW-NEXT: vpor %xmm11, %xmm12, %xmm11
7398 ; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
7399 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
7400 ; AVX512BW-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF
7401 ; AVX512BW-NEXT: kmovq %rax, %k5
7402 ; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm10 {%k5}
7403 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm10, %ymm11
7404 ; AVX512BW-NEXT: vpblendmw %ymm9, %ymm8, %ymm12 {%k3}
7405 ; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm13
7406 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14]
7407 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,2,7,12],zero,zero,zero,xmm12[0,5,10,15],zero,zero,zero
7408 ; AVX512BW-NEXT: vpor %xmm13, %xmm12, %xmm12
7409 ; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
7410 ; AVX512BW-NEXT: vmovdqu8 %ymm12, %ymm11 {%k4}
7411 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10
7412 ; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2}
7413 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
7414 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
7415 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
7416 ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
7417 ; AVX512BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
7418 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
7419 ; AVX512BW-NEXT: movl $554172416, %eax # imm = 0x21080000
7420 ; AVX512BW-NEXT: kmovd %eax, %k2
7421 ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2}
7422 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
7423 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
7424 ; AVX512BW-NEXT: vmovdqu16 %ymm5, %ymm4 {%k3}
7425 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1]
7426 ; AVX512BW-NEXT: movl $2114, %eax # imm = 0x842
7427 ; AVX512BW-NEXT: kmovd %eax, %k2
7428 ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm4 {%k2}
7429 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
7430 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u]
7431 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
7432 ; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2
7433 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
7434 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
7435 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2
7436 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
7437 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5]
7438 ; AVX512BW-NEXT: vpermd %ymm2, %ymm3, %ymm2
7439 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
7440 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5}
7441 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm0
7442 ; AVX512BW-NEXT: vmovdqu16 %ymm9, %ymm8 {%k1}
7443 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero
7444 ; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm3
7445 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
7446 ; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2
7447 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
7448 ; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4}
7449 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
7450 ; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rsi)
7451 ; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rdx)
7452 ; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rcx)
7453 ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r8)
7454 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r9)
7455 ; AVX512BW-NEXT: vzeroupper
7456 ; AVX512BW-NEXT: retq
7457 ;
7458 ; AVX512BW-FCP-LABEL: load_i8_stride5_vf64:
7459 ; AVX512BW-FCP: # %bb.0:
7460 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm3
7461 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
7462 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm0
7463 ; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm1
7464 ; AVX512BW-FCP-NEXT: movw $21140, %ax # imm = 0x5294
7465 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2
7466 ; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k2}
7467 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
7468 ; AVX512BW-FCP-NEXT: movl $1108344832, %eax # imm = 0x42100000
7469 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
7470 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm4 {%k1}
7471 ; AVX512BW-FCP-NEXT: movw $19026, %ax # imm = 0x4A52
7472 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
7473 ; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k1}
7474 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
7475 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
7476 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
7477 ; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm10
7478 ; AVX512BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000
7479 ; AVX512BW-FCP-NEXT: kmovd %eax, %k5
7480 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
7481 ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm5
7482 ; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm4
7483 ; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1}
7484 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
7485 ; AVX512BW-FCP-NEXT: movl $4228, %eax # imm = 0x1084
7486 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3
7487 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm6 {%k3}
7488 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
7489 ; AVX512BW-FCP-NEXT: vmovdqa 176(%rdi), %xmm6
7490 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
7491 ; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm7
7492 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
7493 ; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
7494 ; AVX512BW-FCP-NEXT: movl $127, %eax
7495 ; AVX512BW-FCP-NEXT: kmovd %eax, %k4
7496 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4}
7497 ; AVX512BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm12
7498 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[1,6,11]
7499 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm13
7500 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
7501 ; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9
7502 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
7503 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8
7504 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm10 {%k5}
7505 ; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm10, %ymm11
7506 ; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm9
7507 ; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm8
7508 ; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm8, %ymm14 {%k2}
7509 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
7510 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11]
7511 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,9,14],zero,zero,zero,xmm14[2,7,12],zero,zero,zero
7512 ; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
7513 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
7514 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
7515 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm18
7516 ; AVX512BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A
7517 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3
7518 ; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm11 {%k3}
7519 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1]
7520 ; AVX512BW-FCP-NEXT: movl $-2078212096, %eax # imm = 0x84210000
7521 ; AVX512BW-FCP-NEXT: kmovd %eax, %k6
7522 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm11 {%k6}
7523 ; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k2}
7524 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero,xmm14[u,u,u]
7525 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
7526 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13,u,u,u]
7527 ; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
7528 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
7529 ; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm11 {%k2}
7530 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,3,0,1]
7531 ; AVX512BW-FCP-NEXT: movl $8456, %eax # imm = 0x2108
7532 ; AVX512BW-FCP-NEXT: kmovd %eax, %k6
7533 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k6}
7534 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
7535 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u]
7536 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u]
7537 ; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15
7538 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
7539 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[2,7,12]
7540 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
7541 ; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15
7542 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
7543 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm11
7544 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm14 {%k5}
7545 ; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm14, %ymm11
7546 ; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm9, %ymm15 {%k1}
7547 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16
7548 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12]
7549 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero
7550 ; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
7551 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
7552 ; AVX512BW-FCP-NEXT: movl $-524288, %eax # imm = 0xFFF80000
7553 ; AVX512BW-FCP-NEXT: kmovd %eax, %k4
7554 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
7555 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm19
7556 ; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm14 {%k1}
7557 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
7558 ; AVX512BW-FCP-NEXT: movl $138543104, %eax # imm = 0x8420000
7559 ; AVX512BW-FCP-NEXT: kmovd %eax, %k6
7560 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k6}
7561 ; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k3}
7562 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16
7563 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[4,9,14,u,u,u]
7564 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[u,u,u]
7565 ; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
7566 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
7567 ; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1}
7568 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm14[2,3,0,1]
7569 ; AVX512BW-FCP-NEXT: movl $16912, %eax # imm = 0x4210
7570 ; AVX512BW-FCP-NEXT: kmovd %eax, %k6
7571 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k6}
7572 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
7573 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
7574 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
7575 ; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm10
7576 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4,5,6,7]
7577 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7]
7578 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[3,8,13]
7579 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
7580 ; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14
7581 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
7582 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm10
7583 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm15 {%k5}
7584 ; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm15, %ymm10
7585 ; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm9, %ymm14 {%k2}
7586 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero
7587 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
7588 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13]
7589 ; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm14, %xmm14
7590 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
7591 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm10 {%k4}
7592 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm14
7593 ; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k2}
7594 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1]
7595 ; AVX512BW-FCP-NEXT: movl $277086208, %eax # imm = 0x10840000
7596 ; AVX512BW-FCP-NEXT: kmovd %eax, %k5
7597 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm10 {%k5}
7598 ; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1}
7599 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u]
7600 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
7601 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15,u,u,u]
7602 ; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
7603 ; AVX512BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000
7604 ; AVX512BW-FCP-NEXT: kmovd %eax, %k5
7605 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
7606 ; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm10 {%k2}
7607 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm10[2,3,0,1]
7608 ; AVX512BW-FCP-NEXT: movl $33825, %eax # imm = 0x8421
7609 ; AVX512BW-FCP-NEXT: kmovd %eax, %k5
7610 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm10 {%k5}
7611 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
7612 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
7613 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
7614 ; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm11
7615 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
7616 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
7617 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm12[4,9,14]
7618 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
7619 ; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11
7620 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
7621 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
7622 ; AVX512BW-FCP-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF
7623 ; AVX512BW-FCP-NEXT: kmovq %rax, %k5
7624 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k5}
7625 ; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm10, %ymm11
7626 ; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm8, %ymm12 {%k3}
7627 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
7628 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14]
7629 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,2,7,12],zero,zero,zero,xmm12[0,5,10,15],zero,zero,zero
7630 ; AVX512BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
7631 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
7632 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm12, %ymm11 {%k4}
7633 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10
7634 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2}
7635 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
7636 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
7637 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
7638 ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
7639 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
7640 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
7641 ; AVX512BW-FCP-NEXT: movl $554172416, %eax # imm = 0x21080000
7642 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2
7643 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2}
7644 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
7645 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
7646 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k3}
7647 ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1]
7648 ; AVX512BW-FCP-NEXT: movl $2114, %eax # imm = 0x842
7649 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2
7650 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm4 {%k2}
7651 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
7652 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u]
7653 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
7654 ; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
7655 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
7656 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
7657 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
7658 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
7659 ; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5]
7660 ; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
7661 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
7662 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5}
7663 ; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm0
7664 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm9, %ymm8 {%k1}
7665 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero
7666 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm3
7667 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
7668 ; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
7669 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
7670 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4}
7671 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
7672 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, (%rsi)
7673 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx)
7674 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rcx)
7675 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%r8)
7676 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r9)
7677 ; AVX512BW-FCP-NEXT: vzeroupper
7678 ; AVX512BW-FCP-NEXT: retq
7679 ;
7680 ; AVX512DQ-BW-LABEL: load_i8_stride5_vf64:
7681 ; AVX512DQ-BW: # %bb.0:
7682 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm3
7683 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2
7684 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm0
7685 ; AVX512DQ-BW-NEXT: vmovdqa 96(%rdi), %ymm1
7686 ; AVX512DQ-BW-NEXT: movw $21140, %ax # imm = 0x5294
7687 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2
7688 ; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k2}
7689 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
7690 ; AVX512DQ-BW-NEXT: movl $1108344832, %eax # imm = 0x42100000
7691 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
7692 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm5, %ymm4 {%k1}
7693 ; AVX512DQ-BW-NEXT: movw $19026, %ax # imm = 0x4A52
7694 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
7695 ; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k1}
7696 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm6
7697 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
7698 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
7699 ; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm10
7700 ; AVX512DQ-BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000
7701 ; AVX512DQ-BW-NEXT: kmovd %eax, %k5
7702 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
7703 ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm5
7704 ; AVX512DQ-BW-NEXT: vmovdqa 224(%rdi), %ymm4
7705 ; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1}
7706 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
7707 ; AVX512DQ-BW-NEXT: movl $4228, %eax # imm = 0x1084
7708 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3
7709 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k3}
7710 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
7711 ; AVX512DQ-BW-NEXT: vmovdqa 176(%rdi), %xmm6
7712 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
7713 ; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %xmm7
7714 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
7715 ; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
7716 ; AVX512DQ-BW-NEXT: movl $127, %eax
7717 ; AVX512DQ-BW-NEXT: kmovd %eax, %k4
7718 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4}
7719 ; AVX512DQ-BW-NEXT: vmovdqa 144(%rdi), %xmm12
7720 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[1,6,11]
7721 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm13
7722 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
7723 ; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm11, %xmm9
7724 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
7725 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8
7726 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm10 {%k5}
7727 ; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm10, %ymm11
7728 ; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm9
7729 ; AVX512DQ-BW-NEXT: vmovdqa 288(%rdi), %ymm8
7730 ; AVX512DQ-BW-NEXT: vpblendmw %ymm9, %ymm8, %ymm14 {%k2}
7731 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm15
7732 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11]
7733 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,9,14],zero,zero,zero,xmm14[2,7,12],zero,zero,zero
7734 ; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm14, %xmm14
7735 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
7736 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
7737 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm18
7738 ; AVX512DQ-BW-NEXT: movw $10570, %ax # imm = 0x294A
7739 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3
7740 ; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm11 {%k3}
7741 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1]
7742 ; AVX512DQ-BW-NEXT: movl $-2078212096, %eax # imm = 0x84210000
7743 ; AVX512DQ-BW-NEXT: kmovd %eax, %k6
7744 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm11 {%k6}
7745 ; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k2}
7746 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero,xmm14[u,u,u]
7747 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm14
7748 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13,u,u,u]
7749 ; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm14, %xmm14
7750 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
7751 ; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm11 {%k2}
7752 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,3,0,1]
7753 ; AVX512DQ-BW-NEXT: movl $8456, %eax # imm = 0x2108
7754 ; AVX512DQ-BW-NEXT: kmovd %eax, %k6
7755 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k6}
7756 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
7757 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u]
7758 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u]
7759 ; AVX512DQ-BW-NEXT: vporq %xmm15, %xmm16, %xmm15
7760 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
7761 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[2,7,12]
7762 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
7763 ; AVX512DQ-BW-NEXT: vporq %xmm15, %xmm16, %xmm15
7764 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
7765 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm11
7766 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm11, %zmm14 {%k5}
7767 ; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm14, %ymm11
7768 ; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm9, %ymm15 {%k1}
7769 ; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16
7770 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12]
7771 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero
7772 ; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm15, %xmm15
7773 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
7774 ; AVX512DQ-BW-NEXT: movl $-524288, %eax # imm = 0xFFF80000
7775 ; AVX512DQ-BW-NEXT: kmovd %eax, %k4
7776 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
7777 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm19
7778 ; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm14 {%k1}
7779 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
7780 ; AVX512DQ-BW-NEXT: movl $138543104, %eax # imm = 0x8420000
7781 ; AVX512DQ-BW-NEXT: kmovd %eax, %k6
7782 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm14 {%k6}
7783 ; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k3}
7784 ; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16
7785 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[4,9,14,u,u,u]
7786 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[u,u,u]
7787 ; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm15, %xmm15
7788 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
7789 ; AVX512DQ-BW-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1}
7790 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm14[2,3,0,1]
7791 ; AVX512DQ-BW-NEXT: movl $16912, %eax # imm = 0x4210
7792 ; AVX512DQ-BW-NEXT: kmovd %eax, %k6
7793 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm14 {%k6}
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm10
; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4,5,6,7]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[3,8,13]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
; AVX512DQ-BW-NEXT: vporq %xmm14, %xmm16, %xmm14
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm10
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm15 {%k5}
; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm15, %ymm10
; AVX512DQ-BW-NEXT: vpblendmw %ymm8, %ymm9, %ymm14 {%k2}
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero
; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm14
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13]
; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm14, %xmm14
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm10 {%k4}
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm14
; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k2}
; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1]
; AVX512DQ-BW-NEXT: movl $277086208, %eax # imm = 0x10840000
; AVX512DQ-BW-NEXT: kmovd %eax, %k5
; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm10 {%k5}
; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1}
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u]
; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm15, %xmm15
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15,u,u,u]
; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm15, %xmm15
; AVX512DQ-BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000
; AVX512DQ-BW-NEXT: kmovd %eax, %k5
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpblendmw %ymm4, %ymm5, %ymm10 {%k2}
; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm10[2,3,0,1]
; AVX512DQ-BW-NEXT: movl $33825, %eax # imm = 0x8421
; AVX512DQ-BW-NEXT: kmovd %eax, %k5
; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm10 {%k5}
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vporq %xmm16, %xmm17, %xmm11
; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm12[4,9,14]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm12, %xmm11
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
; AVX512DQ-BW-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF
; AVX512DQ-BW-NEXT: kmovq %rax, %k5
; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm10 {%k5}
; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm10, %ymm11
; AVX512DQ-BW-NEXT: vpblendmw %ymm9, %ymm8, %ymm12 {%k3}
; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm13
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,2,7,12],zero,zero,zero,xmm12[0,5,10,15],zero,zero,zero
; AVX512DQ-BW-NEXT: vpor %xmm13, %xmm12, %xmm12
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
; AVX512DQ-BW-NEXT: vmovdqu8 %ymm12, %ymm11 {%k4}
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10
; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2}
; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX512DQ-BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512DQ-BW-NEXT: movl $554172416, %eax # imm = 0x21080000
; AVX512DQ-BW-NEXT: kmovd %eax, %k2
; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2}
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqu16 %ymm5, %ymm4 {%k3}
; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1]
; AVX512DQ-BW-NEXT: movl $2114, %eax # imm = 0x842
; AVX512DQ-BW-NEXT: kmovd %eax, %k2
; AVX512DQ-BW-NEXT: vmovdqu8 %ymm1, %ymm4 {%k2}
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm2
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5]
; AVX512DQ-BW-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5}
; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; AVX512DQ-BW-NEXT: vmovdqu16 %ymm9, %ymm8 {%k1}
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero
; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm8, %xmm3
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4}
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, (%rsi)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%rcx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%r8)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r9)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i8_stride5_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm1
; AVX512DQ-BW-FCP-NEXT: movw $21140, %ax # imm = 0x5294
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
; AVX512DQ-BW-FCP-NEXT: movl $1108344832, %eax # imm = 0x42100000
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm4 {%k1}
; AVX512DQ-BW-FCP-NEXT: movw $19026, %ax # imm = 0x4A52
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k1}
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm10
; AVX512DQ-BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm5
; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm4
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
; AVX512DQ-BW-FCP-NEXT: movl $4228, %eax # imm = 0x1084
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm6 {%k3}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vmovdqa 176(%rdi), %xmm6
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm7
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
; AVX512DQ-BW-FCP-NEXT: movl $127, %eax
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4}
; AVX512DQ-BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm12
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[1,6,11]
; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm13
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm10 {%k5}
; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm10, %ymm11
; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm9
; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm8
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm8, %ymm14 {%k2}
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,9,14],zero,zero,zero,xmm14[2,7,12],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm18
; AVX512DQ-BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm11 {%k3}
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1]
; AVX512DQ-BW-FCP-NEXT: movl $-2078212096, %eax # imm = 0x84210000
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm11 {%k6}
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero,xmm14[u,u,u]
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm11 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,3,0,1]
; AVX512DQ-BW-FCP-NEXT: movl $8456, %eax # imm = 0x2108
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k6}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[2,7,12]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm16, %xmm15
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm11
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm14 {%k5}
; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm14, %ymm11
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm9, %ymm15 {%k1}
; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX512DQ-BW-FCP-NEXT: movl $-524288, %eax # imm = 0xFFF80000
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm19
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm14 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
; AVX512DQ-BW-FCP-NEXT: movl $138543104, %eax # imm = 0x8420000
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k6}
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k3}
; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[4,9,14,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[u,u,u]
; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm14[2,3,0,1]
; AVX512DQ-BW-FCP-NEXT: movl $16912, %eax # imm = 0x4210
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k6}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm10
; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[3,8,13]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm15 {%k5}
; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm15, %ymm10
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm9, %ymm14 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13]
; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm14, %xmm14
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm10 {%k4}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm14
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1]
; AVX512DQ-BW-FCP-NEXT: movl $277086208, %eax # imm = 0x10840000
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm10 {%k5}
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u]
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
; AVX512DQ-BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm10 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm10[2,3,0,1]
; AVX512DQ-BW-FCP-NEXT: movl $33825, %eax # imm = 0x8421
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k5
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm10 {%k5}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm11
; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm12[4,9,14]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
; AVX512DQ-BW-FCP-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k5
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm10 {%k5}
; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm10, %ymm11
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm8, %ymm12 {%k3}
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,2,7,12],zero,zero,zero,xmm12[0,5,10,15],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm12, %ymm11 {%k4}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2}
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512DQ-BW-FCP-NEXT: movl $554172416, %eax # imm = 0x21080000
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k3}
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1]
; AVX512DQ-BW-FCP-NEXT: movl $2114, %eax # imm = 0x842
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm4 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5]
; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5}
; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm9, %ymm8 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm3
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r9)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
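; The IR below loads one <320 x i8> vector and splits it into five <64 x i8>
; results: result j gathers elements j, j+5, j+10, ... of the wide load, and
; each deinterleaved result is then stored to its own output pointer.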
%wide.vec = load <320 x i8>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <320 x i8> %wide.vec, <320 x i8> poison, <64 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75, i32 80, i32 85, i32 90, i32 95, i32 100, i32 105, i32 110, i32 115, i32 120, i32 125, i32 130, i32 135, i32 140, i32 145, i32 150, i32 155, i32 160, i32 165, i32 170, i32 175, i32 180, i32 185, i32 190, i32 195, i32 200, i32 205, i32 210, i32 215, i32 220, i32 225, i32 230, i32 235, i32 240, i32 245, i32 250, i32 255, i32 260, i32 265, i32 270, i32 275, i32 280, i32 285, i32 290, i32 295, i32 300, i32 305, i32 310, i32 315>
%strided.vec1 = shufflevector <320 x i8> %wide.vec, <320 x i8> poison, <64 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36, i32 41, i32 46, i32 51, i32 56, i32 61, i32 66, i32 71, i32 76, i32 81, i32 86, i32 91, i32 96, i32 101, i32 106, i32 111, i32 116, i32 121, i32 126, i32 131, i32 136, i32 141, i32 146, i32 151, i32 156, i32 161, i32 166, i32 171, i32 176, i32 181, i32 186, i32 191, i32 196, i32 201, i32 206, i32 211, i32 216, i32 221, i32 226, i32 231, i32 236, i32 241, i32 246, i32 251, i32 256, i32 261, i32 266, i32 271, i32 276, i32 281, i32 286, i32 291, i32 296, i32 301, i32 306, i32 311, i32 316>
%strided.vec2 = shufflevector <320 x i8> %wide.vec, <320 x i8> poison, <64 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37, i32 42, i32 47, i32 52, i32 57, i32 62, i32 67, i32 72, i32 77, i32 82, i32 87, i32 92, i32 97, i32 102, i32 107, i32 112, i32 117, i32 122, i32 127, i32 132, i32 137, i32 142, i32 147, i32 152, i32 157, i32 162, i32 167, i32 172, i32 177, i32 182, i32 187, i32 192, i32 197, i32 202, i32 207, i32 212, i32 217, i32 222, i32 227, i32 232, i32 237, i32 242, i32 247, i32 252, i32 257, i32 262, i32 267, i32 272, i32 277, i32 282, i32 287, i32 292, i32 297, i32 302, i32 307, i32 312, i32 317>
%strided.vec3 = shufflevector <320 x i8> %wide.vec, <320 x i8> poison, <64 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38, i32 43, i32 48, i32 53, i32 58, i32 63, i32 68, i32 73, i32 78, i32 83, i32 88, i32 93, i32 98, i32 103, i32 108, i32 113, i32 118, i32 123, i32 128, i32 133, i32 138, i32 143, i32 148, i32 153, i32 158, i32 163, i32 168, i32 173, i32 178, i32 183, i32 188, i32 193, i32 198, i32 203, i32 208, i32 213, i32 218, i32 223, i32 228, i32 233, i32 238, i32 243, i32 248, i32 253, i32 258, i32 263, i32 268, i32 273, i32 278, i32 283, i32 288, i32 293, i32 298, i32 303, i32 308, i32 313, i32 318>
%strided.vec4 = shufflevector <320 x i8> %wide.vec, <320 x i8> poison, <64 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39, i32 44, i32 49, i32 54, i32 59, i32 64, i32 69, i32 74, i32 79, i32 84, i32 89, i32 94, i32 99, i32 104, i32 109, i32 114, i32 119, i32 124, i32 129, i32 134, i32 139, i32 144, i32 149, i32 154, i32 159, i32 164, i32 169, i32 174, i32 179, i32 184, i32 189, i32 194, i32 199, i32 204, i32 209, i32 214, i32 219, i32 224, i32 229, i32 234, i32 239, i32 244, i32 249, i32 254, i32 259, i32 264, i32 269, i32 274, i32 279, i32 284, i32 289, i32 294, i32 299, i32 304, i32 309, i32 314, i32 319>
store <64 x i8> %strided.vec0, ptr %out.vec0, align 64
store <64 x i8> %strided.vec1, ptr %out.vec1, align 64
store <64 x i8> %strided.vec2, ptr %out.vec2, align 64
store <64 x i8> %strided.vec3, ptr %out.vec3, align 64
store <64 x i8> %strided.vec4, ptr %out.vec4, align 64