1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
16 ; These patterns are produced by LoopVectorizer for interleaved loads.
18 define void @load_i8_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
19 ; SSE-LABEL: load_i8_stride7_vf2:
21 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
22 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
23 ; SSE-NEXT: movdqa (%rdi), %xmm3
24 ; SSE-NEXT: pxor %xmm4, %xmm4
25 ; SSE-NEXT: movdqa %xmm3, %xmm2
26 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
27 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3]
28 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
29 ; SSE-NEXT: packuswb %xmm1, %xmm1
30 ; SSE-NEXT: movdqa %xmm2, %xmm0
31 ; SSE-NEXT: psrld $16, %xmm0
32 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
33 ; SSE-NEXT: movdqa %xmm0, %xmm4
34 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
35 ; SSE-NEXT: packuswb %xmm4, %xmm4
36 ; SSE-NEXT: movdqa %xmm2, %xmm6
37 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
38 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,1,2,3]
39 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
40 ; SSE-NEXT: packuswb %xmm5, %xmm5
41 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7]
42 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2]
43 ; SSE-NEXT: packuswb %xmm6, %xmm6
44 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3]
45 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
46 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
47 ; SSE-NEXT: psrlq $48, %xmm3
48 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
49 ; SSE-NEXT: packuswb %xmm7, %xmm7
50 ; SSE-NEXT: packuswb %xmm0, %xmm0
51 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
52 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
53 ; SSE-NEXT: packuswb %xmm2, %xmm2
54 ; SSE-NEXT: movd %xmm1, %edi
55 ; SSE-NEXT: movw %di, (%rsi)
56 ; SSE-NEXT: movd %xmm4, %esi
57 ; SSE-NEXT: movw %si, (%rdx)
58 ; SSE-NEXT: movd %xmm5, %edx
59 ; SSE-NEXT: movw %dx, (%rcx)
60 ; SSE-NEXT: movd %xmm6, %ecx
61 ; SSE-NEXT: movw %cx, (%r8)
62 ; SSE-NEXT: movd %xmm7, %ecx
63 ; SSE-NEXT: movw %cx, (%r9)
64 ; SSE-NEXT: movd %xmm0, %ecx
65 ; SSE-NEXT: movw %cx, (%r10)
66 ; SSE-NEXT: movd %xmm2, %ecx
67 ; SSE-NEXT: movw %cx, (%rax)
70 ; AVX-LABEL: load_i8_stride7_vf2:
72 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
73 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
74 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
75 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
76 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
77 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
78 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
79 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
80 ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
81 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
82 ; AVX-NEXT: vpextrw $0, %xmm1, (%rsi)
83 ; AVX-NEXT: vpextrw $0, %xmm2, (%rdx)
84 ; AVX-NEXT: vpextrw $0, %xmm3, (%rcx)
85 ; AVX-NEXT: vpextrw $0, %xmm4, (%r8)
86 ; AVX-NEXT: vpextrw $0, %xmm5, (%r9)
87 ; AVX-NEXT: vpextrw $0, %xmm6, (%r10)
88 ; AVX-NEXT: vpextrw $0, %xmm0, (%rax)
91 ; AVX2-LABEL: load_i8_stride7_vf2:
93 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
94 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
95 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
96 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
97 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
98 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
99 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
100 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
101 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
102 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
103 ; AVX2-NEXT: vpextrw $0, %xmm1, (%rsi)
104 ; AVX2-NEXT: vpextrw $0, %xmm2, (%rdx)
105 ; AVX2-NEXT: vpextrw $0, %xmm3, (%rcx)
106 ; AVX2-NEXT: vpextrw $0, %xmm4, (%r8)
107 ; AVX2-NEXT: vpextrw $0, %xmm5, (%r9)
108 ; AVX2-NEXT: vpextrw $0, %xmm6, (%r10)
109 ; AVX2-NEXT: vpextrw $0, %xmm0, (%rax)
112 ; AVX2-FP-LABEL: load_i8_stride7_vf2:
114 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
115 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
116 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
117 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
118 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
119 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
120 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
121 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
122 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
123 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
124 ; AVX2-FP-NEXT: vpextrw $0, %xmm1, (%rsi)
125 ; AVX2-FP-NEXT: vpextrw $0, %xmm2, (%rdx)
126 ; AVX2-FP-NEXT: vpextrw $0, %xmm3, (%rcx)
127 ; AVX2-FP-NEXT: vpextrw $0, %xmm4, (%r8)
128 ; AVX2-FP-NEXT: vpextrw $0, %xmm5, (%r9)
129 ; AVX2-FP-NEXT: vpextrw $0, %xmm6, (%r10)
130 ; AVX2-FP-NEXT: vpextrw $0, %xmm0, (%rax)
133 ; AVX2-FCP-LABEL: load_i8_stride7_vf2:
135 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
136 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
137 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
138 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
139 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
140 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
141 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
142 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
143 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
144 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
145 ; AVX2-FCP-NEXT: vpextrw $0, %xmm1, (%rsi)
146 ; AVX2-FCP-NEXT: vpextrw $0, %xmm2, (%rdx)
147 ; AVX2-FCP-NEXT: vpextrw $0, %xmm3, (%rcx)
148 ; AVX2-FCP-NEXT: vpextrw $0, %xmm4, (%r8)
149 ; AVX2-FCP-NEXT: vpextrw $0, %xmm5, (%r9)
150 ; AVX2-FCP-NEXT: vpextrw $0, %xmm6, (%r10)
151 ; AVX2-FCP-NEXT: vpextrw $0, %xmm0, (%rax)
152 ; AVX2-FCP-NEXT: retq
154 ; AVX512-LABEL: load_i8_stride7_vf2:
156 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
157 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
158 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
159 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
160 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
161 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
162 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
163 ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
164 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
165 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
166 ; AVX512-NEXT: vpextrw $0, %xmm1, (%rsi)
167 ; AVX512-NEXT: vpextrw $0, %xmm2, (%rdx)
168 ; AVX512-NEXT: vpextrw $0, %xmm3, (%rcx)
169 ; AVX512-NEXT: vpextrw $0, %xmm4, (%r8)
170 ; AVX512-NEXT: vpextrw $0, %xmm5, (%r9)
171 ; AVX512-NEXT: vpextrw $0, %xmm6, (%r10)
172 ; AVX512-NEXT: vpextrw $0, %xmm0, (%rax)
175 ; AVX512-FCP-LABEL: load_i8_stride7_vf2:
176 ; AVX512-FCP: # %bb.0:
177 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
178 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
179 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
180 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
181 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
182 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
183 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
184 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
185 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
186 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
187 ; AVX512-FCP-NEXT: vpextrw $0, %xmm1, (%rsi)
188 ; AVX512-FCP-NEXT: vpextrw $0, %xmm2, (%rdx)
189 ; AVX512-FCP-NEXT: vpextrw $0, %xmm3, (%rcx)
190 ; AVX512-FCP-NEXT: vpextrw $0, %xmm4, (%r8)
191 ; AVX512-FCP-NEXT: vpextrw $0, %xmm5, (%r9)
192 ; AVX512-FCP-NEXT: vpextrw $0, %xmm6, (%r10)
193 ; AVX512-FCP-NEXT: vpextrw $0, %xmm0, (%rax)
194 ; AVX512-FCP-NEXT: retq
196 ; AVX512DQ-LABEL: load_i8_stride7_vf2:
198 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
199 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
200 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
201 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
202 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
203 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
204 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
205 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
206 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
207 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
208 ; AVX512DQ-NEXT: vpextrw $0, %xmm1, (%rsi)
209 ; AVX512DQ-NEXT: vpextrw $0, %xmm2, (%rdx)
210 ; AVX512DQ-NEXT: vpextrw $0, %xmm3, (%rcx)
211 ; AVX512DQ-NEXT: vpextrw $0, %xmm4, (%r8)
212 ; AVX512DQ-NEXT: vpextrw $0, %xmm5, (%r9)
213 ; AVX512DQ-NEXT: vpextrw $0, %xmm6, (%r10)
214 ; AVX512DQ-NEXT: vpextrw $0, %xmm0, (%rax)
215 ; AVX512DQ-NEXT: retq
217 ; AVX512DQ-FCP-LABEL: load_i8_stride7_vf2:
218 ; AVX512DQ-FCP: # %bb.0:
219 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
220 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
221 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
222 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
223 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
224 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
225 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
226 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
227 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
228 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
229 ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm1, (%rsi)
230 ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm2, (%rdx)
231 ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm3, (%rcx)
232 ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm4, (%r8)
233 ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm5, (%r9)
234 ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm6, (%r10)
235 ; AVX512DQ-FCP-NEXT: vpextrw $0, %xmm0, (%rax)
236 ; AVX512DQ-FCP-NEXT: retq
238 ; AVX512BW-LABEL: load_i8_stride7_vf2:
240 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
241 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
242 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
243 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
244 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
245 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
246 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
247 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
248 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
249 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
250 ; AVX512BW-NEXT: vpextrw $0, %xmm1, (%rsi)
251 ; AVX512BW-NEXT: vpextrw $0, %xmm2, (%rdx)
252 ; AVX512BW-NEXT: vpextrw $0, %xmm3, (%rcx)
253 ; AVX512BW-NEXT: vpextrw $0, %xmm4, (%r8)
254 ; AVX512BW-NEXT: vpextrw $0, %xmm5, (%r9)
255 ; AVX512BW-NEXT: vpextrw $0, %xmm6, (%r10)
256 ; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rax)
257 ; AVX512BW-NEXT: retq
259 ; AVX512BW-FCP-LABEL: load_i8_stride7_vf2:
260 ; AVX512BW-FCP: # %bb.0:
261 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
262 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
263 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
264 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
265 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
266 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
267 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
268 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
269 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
270 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
271 ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm1, (%rsi)
272 ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm2, (%rdx)
273 ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm3, (%rcx)
274 ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm4, (%r8)
275 ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm5, (%r9)
276 ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm6, (%r10)
277 ; AVX512BW-FCP-NEXT: vpextrw $0, %xmm0, (%rax)
278 ; AVX512BW-FCP-NEXT: retq
280 ; AVX512DQ-BW-LABEL: load_i8_stride7_vf2:
281 ; AVX512DQ-BW: # %bb.0:
282 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
283 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
284 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
285 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
286 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
287 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
288 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
289 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
290 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
291 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
292 ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm1, (%rsi)
293 ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm2, (%rdx)
294 ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm3, (%rcx)
295 ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm4, (%r8)
296 ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm5, (%r9)
297 ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm6, (%r10)
298 ; AVX512DQ-BW-NEXT: vpextrw $0, %xmm0, (%rax)
299 ; AVX512DQ-BW-NEXT: retq
301 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf2:
302 ; AVX512DQ-BW-FCP: # %bb.0:
303 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
304 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
305 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
306 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
307 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
308 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
309 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
310 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
311 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
312 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
313 ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm1, (%rsi)
314 ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm2, (%rdx)
315 ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm3, (%rcx)
316 ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm4, (%r8)
317 ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm5, (%r9)
318 ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm6, (%r10)
319 ; AVX512DQ-BW-FCP-NEXT: vpextrw $0, %xmm0, (%rax)
320 ; AVX512DQ-BW-FCP-NEXT: retq
321 %wide.vec = load <14 x i8>, ptr %in.vec, align 64
322 %strided.vec0 = shufflevector <14 x i8> %wide.vec, <14 x i8> poison, <2 x i32> <i32 0, i32 7>
323 %strided.vec1 = shufflevector <14 x i8> %wide.vec, <14 x i8> poison, <2 x i32> <i32 1, i32 8>
324 %strided.vec2 = shufflevector <14 x i8> %wide.vec, <14 x i8> poison, <2 x i32> <i32 2, i32 9>
325 %strided.vec3 = shufflevector <14 x i8> %wide.vec, <14 x i8> poison, <2 x i32> <i32 3, i32 10>
326 %strided.vec4 = shufflevector <14 x i8> %wide.vec, <14 x i8> poison, <2 x i32> <i32 4, i32 11>
327 %strided.vec5 = shufflevector <14 x i8> %wide.vec, <14 x i8> poison, <2 x i32> <i32 5, i32 12>
328 %strided.vec6 = shufflevector <14 x i8> %wide.vec, <14 x i8> poison, <2 x i32> <i32 6, i32 13>
329 store <2 x i8> %strided.vec0, ptr %out.vec0, align 64
330 store <2 x i8> %strided.vec1, ptr %out.vec1, align 64
331 store <2 x i8> %strided.vec2, ptr %out.vec2, align 64
332 store <2 x i8> %strided.vec3, ptr %out.vec3, align 64
333 store <2 x i8> %strided.vec4, ptr %out.vec4, align 64
334 store <2 x i8> %strided.vec5, ptr %out.vec5, align 64
335 store <2 x i8> %strided.vec6, ptr %out.vec6, align 64
339 define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
340 ; SSE-LABEL: load_i8_stride7_vf4:
342 ; SSE-NEXT: movdqa (%rdi), %xmm4
343 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
344 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,65535,65535,65535]
345 ; SSE-NEXT: movdqa %xmm4, %xmm1
346 ; SSE-NEXT: pand %xmm3, %xmm1
347 ; SSE-NEXT: pandn %xmm0, %xmm3
348 ; SSE-NEXT: por %xmm1, %xmm3
349 ; SSE-NEXT: pxor %xmm1, %xmm1
350 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
351 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,0,65535]
352 ; SSE-NEXT: pand %xmm2, %xmm3
353 ; SSE-NEXT: movdqa %xmm0, %xmm5
354 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm4[0,0]
355 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,3]
356 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,65535,65535]
357 ; SSE-NEXT: movdqa %xmm0, %xmm8
358 ; SSE-NEXT: pand %xmm7, %xmm8
359 ; SSE-NEXT: pandn %xmm4, %xmm7
360 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,0,65535,65535]
361 ; SSE-NEXT: movdqa %xmm0, %xmm9
362 ; SSE-NEXT: pand %xmm6, %xmm9
363 ; SSE-NEXT: pandn %xmm4, %xmm6
364 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,0,65535,65535]
365 ; SSE-NEXT: movdqa %xmm0, %xmm14
366 ; SSE-NEXT: pand %xmm13, %xmm14
367 ; SSE-NEXT: pandn %xmm4, %xmm13
368 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[1,3,2,3]
369 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,2,2,3]
370 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,65535,0,65535]
371 ; SSE-NEXT: pand %xmm11, %xmm0
372 ; SSE-NEXT: pandn %xmm4, %xmm11
373 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
374 ; SSE-NEXT: pandn %xmm4, %xmm2
375 ; SSE-NEXT: por %xmm3, %xmm2
376 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,65535,65535,0]
377 ; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
378 ; SSE-NEXT: pand %xmm15, %xmm5
379 ; SSE-NEXT: pandn %xmm4, %xmm15
380 ; SSE-NEXT: por %xmm5, %xmm15
381 ; SSE-NEXT: por %xmm8, %xmm7
382 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
383 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[0,2,2,3,4,5,6,7]
384 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
385 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[1,1,0,3,4,5,6,7]
386 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535]
387 ; SSE-NEXT: pand %xmm3, %xmm5
388 ; SSE-NEXT: pandn %xmm4, %xmm3
389 ; SSE-NEXT: por %xmm5, %xmm3
390 ; SSE-NEXT: por %xmm9, %xmm6
391 ; SSE-NEXT: movdqa %xmm6, %xmm4
392 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
393 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
394 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
395 ; SSE-NEXT: por %xmm14, %xmm13
396 ; SSE-NEXT: movdqa %xmm13, %xmm4
397 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
398 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[3,1,2,3,4,5,6,7]
399 ; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3],xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
400 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,1,2,3]
401 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
402 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
403 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
404 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi
405 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
406 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
407 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
408 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
409 ; SSE-NEXT: packuswb %xmm2, %xmm2
410 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,3,2,3]
411 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,0,3,2,4,5,6,7]
412 ; SSE-NEXT: packuswb %xmm5, %xmm5
413 ; SSE-NEXT: packuswb %xmm3, %xmm3
414 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7]
415 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7]
416 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,0,2,3]
417 ; SSE-NEXT: packuswb %xmm6, %xmm6
418 ; SSE-NEXT: packuswb %xmm4, %xmm4
419 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1]
420 ; SSE-NEXT: movdqa %xmm10, %xmm7
421 ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
422 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
423 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7]
424 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
425 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm10[2,1,2,3]
426 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
427 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
428 ; SSE-NEXT: packuswb %xmm8, %xmm8
429 ; SSE-NEXT: por %xmm0, %xmm11
430 ; SSE-NEXT: movdqa %xmm11, %xmm0
431 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
432 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
433 ; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15]
434 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,2,1]
435 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7]
436 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
437 ; SSE-NEXT: packuswb %xmm0, %xmm0
438 ; SSE-NEXT: movd %xmm2, (%rsi)
439 ; SSE-NEXT: movd %xmm5, (%rdx)
440 ; SSE-NEXT: movd %xmm3, (%rcx)
441 ; SSE-NEXT: movd %xmm6, (%r8)
442 ; SSE-NEXT: movd %xmm4, (%r9)
443 ; SSE-NEXT: movd %xmm8, (%rdi)
444 ; SSE-NEXT: movd %xmm0, (%rax)
447 ; AVX-LABEL: load_i8_stride7_vf4:
449 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
450 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
451 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
452 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
453 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
454 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
455 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
456 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
457 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
458 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
459 ; AVX-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
460 ; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm5
461 ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
462 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
463 ; AVX-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
464 ; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm7
465 ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
466 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
467 ; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm4
468 ; AVX-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
469 ; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm9
470 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
471 ; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm6
472 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
473 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
474 ; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm1
475 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
476 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
477 ; AVX-NEXT: vmovd %xmm2, (%rsi)
478 ; AVX-NEXT: vmovd %xmm3, (%rdx)
479 ; AVX-NEXT: vmovd %xmm5, (%rcx)
480 ; AVX-NEXT: vmovd %xmm7, (%r8)
481 ; AVX-NEXT: vmovd %xmm4, (%r9)
482 ; AVX-NEXT: vmovd %xmm6, (%r10)
483 ; AVX-NEXT: vmovd %xmm0, (%rax)
486 ; AVX2-LABEL: load_i8_stride7_vf4:
488 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
489 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
490 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
491 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
492 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
493 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
494 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
495 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
496 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
497 ; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3
498 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
499 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm5
500 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
501 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
502 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
503 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm7
504 ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
505 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
506 ; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm4
507 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
508 ; AVX2-NEXT: vpshufb %xmm8, %xmm0, %xmm9
509 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
510 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm6
511 ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
512 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
513 ; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm1
514 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
515 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
516 ; AVX2-NEXT: vmovd %xmm2, (%rsi)
517 ; AVX2-NEXT: vmovd %xmm3, (%rdx)
518 ; AVX2-NEXT: vmovd %xmm5, (%rcx)
519 ; AVX2-NEXT: vmovd %xmm7, (%r8)
520 ; AVX2-NEXT: vmovd %xmm4, (%r9)
521 ; AVX2-NEXT: vmovd %xmm6, (%r10)
522 ; AVX2-NEXT: vmovd %xmm0, (%rax)
525 ; AVX2-FP-LABEL: load_i8_stride7_vf4:
527 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
528 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
529 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
530 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
531 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
532 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
533 ; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2
534 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
535 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
536 ; AVX2-FP-NEXT: vpor %xmm3, %xmm4, %xmm3
537 ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
538 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm5
539 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
540 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
541 ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
542 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm7
543 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
544 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
545 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, %xmm4
546 ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
547 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm0, %xmm9
548 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
549 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm6
550 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
551 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
552 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
553 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
554 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
555 ; AVX2-FP-NEXT: vmovd %xmm2, (%rsi)
556 ; AVX2-FP-NEXT: vmovd %xmm3, (%rdx)
557 ; AVX2-FP-NEXT: vmovd %xmm5, (%rcx)
558 ; AVX2-FP-NEXT: vmovd %xmm7, (%r8)
559 ; AVX2-FP-NEXT: vmovd %xmm4, (%r9)
560 ; AVX2-FP-NEXT: vmovd %xmm6, (%r10)
561 ; AVX2-FP-NEXT: vmovd %xmm0, (%rax)
564 ; AVX2-FCP-LABEL: load_i8_stride7_vf4:
566 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
567 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
568 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
569 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
570 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
571 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
572 ; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
573 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
574 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
575 ; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
576 ; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
577 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm5
578 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
579 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
580 ; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
581 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7
582 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
583 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
584 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm4
585 ; AVX2-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
586 ; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9
587 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
588 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6
589 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
590 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
591 ; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
592 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
593 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
594 ; AVX2-FCP-NEXT: vmovd %xmm2, (%rsi)
595 ; AVX2-FCP-NEXT: vmovd %xmm3, (%rdx)
596 ; AVX2-FCP-NEXT: vmovd %xmm5, (%rcx)
597 ; AVX2-FCP-NEXT: vmovd %xmm7, (%r8)
598 ; AVX2-FCP-NEXT: vmovd %xmm4, (%r9)
599 ; AVX2-FCP-NEXT: vmovd %xmm6, (%r10)
600 ; AVX2-FCP-NEXT: vmovd %xmm0, (%rax)
601 ; AVX2-FCP-NEXT: retq
603 ; AVX512-LABEL: load_i8_stride7_vf4:
605 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
606 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
607 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
608 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
609 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
610 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
611 ; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2
612 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
613 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
614 ; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3
615 ; AVX512-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
616 ; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm5
617 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
618 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
619 ; AVX512-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
620 ; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm7
621 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
622 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
623 ; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm4
624 ; AVX512-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
625 ; AVX512-NEXT: vpshufb %xmm8, %xmm0, %xmm9
626 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
627 ; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm6
628 ; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
629 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
630 ; AVX512-NEXT: vpshufb %xmm8, %xmm1, %xmm1
631 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
632 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
633 ; AVX512-NEXT: vmovd %xmm2, (%rsi)
634 ; AVX512-NEXT: vmovd %xmm3, (%rdx)
635 ; AVX512-NEXT: vmovd %xmm5, (%rcx)
636 ; AVX512-NEXT: vmovd %xmm7, (%r8)
637 ; AVX512-NEXT: vmovd %xmm4, (%r9)
638 ; AVX512-NEXT: vmovd %xmm6, (%r10)
639 ; AVX512-NEXT: vmovd %xmm0, (%rax)
642 ; AVX512-FCP-LABEL: load_i8_stride7_vf4:
643 ; AVX512-FCP: # %bb.0:
644 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
645 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
646 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
647 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
648 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
649 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
650 ; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
651 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
652 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
653 ; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
654 ; AVX512-FCP-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
655 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm5
656 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
657 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
658 ; AVX512-FCP-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
659 ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7
660 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
661 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
662 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm4
663 ; AVX512-FCP-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
664 ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9
665 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
666 ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6
667 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
668 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
669 ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
670 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
671 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
672 ; AVX512-FCP-NEXT: vmovd %xmm2, (%rsi)
673 ; AVX512-FCP-NEXT: vmovd %xmm3, (%rdx)
674 ; AVX512-FCP-NEXT: vmovd %xmm5, (%rcx)
675 ; AVX512-FCP-NEXT: vmovd %xmm7, (%r8)
676 ; AVX512-FCP-NEXT: vmovd %xmm4, (%r9)
677 ; AVX512-FCP-NEXT: vmovd %xmm6, (%r10)
678 ; AVX512-FCP-NEXT: vmovd %xmm0, (%rax)
679 ; AVX512-FCP-NEXT: retq
681 ; AVX512DQ-LABEL: load_i8_stride7_vf4:
683 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
684 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
685 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
686 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
687 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
688 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
689 ; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2
690 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
691 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
692 ; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3
693 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
694 ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm5
695 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
696 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
697 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
698 ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm0, %xmm7
699 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
700 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
701 ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm1, %xmm4
702 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
703 ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm0, %xmm9
704 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
705 ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm6
706 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
707 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
708 ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm1, %xmm1
709 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
710 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
711 ; AVX512DQ-NEXT: vmovd %xmm2, (%rsi)
712 ; AVX512DQ-NEXT: vmovd %xmm3, (%rdx)
713 ; AVX512DQ-NEXT: vmovd %xmm5, (%rcx)
714 ; AVX512DQ-NEXT: vmovd %xmm7, (%r8)
715 ; AVX512DQ-NEXT: vmovd %xmm4, (%r9)
716 ; AVX512DQ-NEXT: vmovd %xmm6, (%r10)
717 ; AVX512DQ-NEXT: vmovd %xmm0, (%rax)
718 ; AVX512DQ-NEXT: retq
720 ; AVX512DQ-FCP-LABEL: load_i8_stride7_vf4:
721 ; AVX512DQ-FCP: # %bb.0:
722 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
723 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
724 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
725 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
726 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
727 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
728 ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
729 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
730 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
731 ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
732 ; AVX512DQ-FCP-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
733 ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm5
734 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
735 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
736 ; AVX512DQ-FCP-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
737 ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7
738 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
739 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
740 ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm4
741 ; AVX512DQ-FCP-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
742 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9
743 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
744 ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6
745 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
746 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
747 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
748 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
749 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
750 ; AVX512DQ-FCP-NEXT: vmovd %xmm2, (%rsi)
751 ; AVX512DQ-FCP-NEXT: vmovd %xmm3, (%rdx)
752 ; AVX512DQ-FCP-NEXT: vmovd %xmm5, (%rcx)
753 ; AVX512DQ-FCP-NEXT: vmovd %xmm7, (%r8)
754 ; AVX512DQ-FCP-NEXT: vmovd %xmm4, (%r9)
755 ; AVX512DQ-FCP-NEXT: vmovd %xmm6, (%r10)
756 ; AVX512DQ-FCP-NEXT: vmovd %xmm0, (%rax)
757 ; AVX512DQ-FCP-NEXT: retq
759 ; AVX512BW-LABEL: load_i8_stride7_vf4:
761 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
762 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
763 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
764 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
765 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
766 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
767 ; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2
768 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
769 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
770 ; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3
771 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
772 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm0, %xmm5
773 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
774 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
775 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
776 ; AVX512BW-NEXT: vpshufb %xmm6, %xmm0, %xmm7
777 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
778 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
779 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm1, %xmm4
780 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
781 ; AVX512BW-NEXT: vpshufb %xmm8, %xmm0, %xmm9
782 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
783 ; AVX512BW-NEXT: vpshufb %xmm6, %xmm1, %xmm6
784 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
785 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
786 ; AVX512BW-NEXT: vpshufb %xmm8, %xmm1, %xmm1
787 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
788 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
789 ; AVX512BW-NEXT: vmovd %xmm2, (%rsi)
790 ; AVX512BW-NEXT: vmovd %xmm3, (%rdx)
791 ; AVX512BW-NEXT: vmovd %xmm5, (%rcx)
792 ; AVX512BW-NEXT: vmovd %xmm7, (%r8)
793 ; AVX512BW-NEXT: vmovd %xmm4, (%r9)
794 ; AVX512BW-NEXT: vmovd %xmm6, (%r10)
795 ; AVX512BW-NEXT: vmovd %xmm0, (%rax)
796 ; AVX512BW-NEXT: retq
798 ; AVX512BW-FCP-LABEL: load_i8_stride7_vf4:
799 ; AVX512BW-FCP: # %bb.0:
800 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
801 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
802 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
803 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
804 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
805 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
806 ; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
807 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
808 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
809 ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
810 ; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
811 ; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm5
812 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
813 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
814 ; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
815 ; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7
816 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
817 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
818 ; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm4
819 ; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
820 ; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9
821 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
822 ; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6
823 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
824 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
825 ; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
826 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
827 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
828 ; AVX512BW-FCP-NEXT: vmovd %xmm2, (%rsi)
829 ; AVX512BW-FCP-NEXT: vmovd %xmm3, (%rdx)
830 ; AVX512BW-FCP-NEXT: vmovd %xmm5, (%rcx)
831 ; AVX512BW-FCP-NEXT: vmovd %xmm7, (%r8)
832 ; AVX512BW-FCP-NEXT: vmovd %xmm4, (%r9)
833 ; AVX512BW-FCP-NEXT: vmovd %xmm6, (%r10)
834 ; AVX512BW-FCP-NEXT: vmovd %xmm0, (%rax)
835 ; AVX512BW-FCP-NEXT: retq
837 ; AVX512DQ-BW-LABEL: load_i8_stride7_vf4:
838 ; AVX512DQ-BW: # %bb.0:
839 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
840 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
841 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
842 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1
843 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
844 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
845 ; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2
846 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
847 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
848 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3
849 ; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
850 ; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm0, %xmm5
851 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
852 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
853 ; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
854 ; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm0, %xmm7
855 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
856 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
857 ; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm1, %xmm4
858 ; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
859 ; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm0, %xmm9
860 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
861 ; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm1, %xmm6
862 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
863 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
864 ; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm1, %xmm1
865 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
866 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
867 ; AVX512DQ-BW-NEXT: vmovd %xmm2, (%rsi)
868 ; AVX512DQ-BW-NEXT: vmovd %xmm3, (%rdx)
869 ; AVX512DQ-BW-NEXT: vmovd %xmm5, (%rcx)
870 ; AVX512DQ-BW-NEXT: vmovd %xmm7, (%r8)
871 ; AVX512DQ-BW-NEXT: vmovd %xmm4, (%r9)
872 ; AVX512DQ-BW-NEXT: vmovd %xmm6, (%r10)
873 ; AVX512DQ-BW-NEXT: vmovd %xmm0, (%rax)
874 ; AVX512DQ-BW-NEXT: retq
876 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf4:
877 ; AVX512DQ-BW-FCP: # %bb.0:
878 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
879 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
880 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
881 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
882 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
883 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,7,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
884 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
885 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u]
886 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
887 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
888 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
889 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm5
890 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
891 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
892 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
893 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm7
894 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
895 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
896 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm4
897 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
898 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm9
899 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
900 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6
901 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
902 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
903 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
904 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
905 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
906 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm2, (%rsi)
907 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm3, (%rdx)
908 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm5, (%rcx)
909 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm7, (%r8)
910 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm4, (%r9)
911 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm6, (%r10)
912 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%rax)
913 ; AVX512DQ-BW-FCP-NEXT: retq
914 %wide.vec = load <28 x i8>, ptr %in.vec, align 64
915 %strided.vec0 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 0, i32 7, i32 14, i32 21>
916 %strided.vec1 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 1, i32 8, i32 15, i32 22>
917 %strided.vec2 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 2, i32 9, i32 16, i32 23>
918 %strided.vec3 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 3, i32 10, i32 17, i32 24>
919 %strided.vec4 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 4, i32 11, i32 18, i32 25>
920 %strided.vec5 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 5, i32 12, i32 19, i32 26>
921 %strided.vec6 = shufflevector <28 x i8> %wide.vec, <28 x i8> poison, <4 x i32> <i32 6, i32 13, i32 20, i32 27>
922 store <4 x i8> %strided.vec0, ptr %out.vec0, align 64
923 store <4 x i8> %strided.vec1, ptr %out.vec1, align 64
924 store <4 x i8> %strided.vec2, ptr %out.vec2, align 64
925 store <4 x i8> %strided.vec3, ptr %out.vec3, align 64
926 store <4 x i8> %strided.vec4, ptr %out.vec4, align 64
927 store <4 x i8> %strided.vec5, ptr %out.vec5, align 64
928 store <4 x i8> %strided.vec6, ptr %out.vec6, align 64
929 ret void
930 }
932 define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
933 ; SSE-LABEL: load_i8_stride7_vf8:
934 ; SSE: # %bb.0:
935 ; SSE-NEXT: movdqa (%rdi), %xmm3
936 ; SSE-NEXT: movdqa 16(%rdi), %xmm4
937 ; SSE-NEXT: movdqa 32(%rdi), %xmm6
938 ; SSE-NEXT: movdqa 48(%rdi), %xmm0
939 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535]
940 ; SSE-NEXT: movdqa %xmm3, %xmm1
941 ; SSE-NEXT: pand %xmm2, %xmm1
942 ; SSE-NEXT: pandn %xmm4, %xmm2
943 ; SSE-NEXT: movdqa %xmm4, %xmm11
944 ; SSE-NEXT: por %xmm1, %xmm2
945 ; SSE-NEXT: pxor %xmm1, %xmm1
946 ; SSE-NEXT: movdqa %xmm2, %xmm5
947 ; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
948 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,65535]
949 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
950 ; SSE-NEXT: pxor %xmm4, %xmm4
951 ; SSE-NEXT: pand %xmm7, %xmm2
952 ; SSE-NEXT: pandn %xmm5, %xmm7
953 ; SSE-NEXT: por %xmm2, %xmm7
954 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,1,3]
955 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
956 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,1,1]
957 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[0,3,2,1,4,5,6,7]
958 ; SSE-NEXT: packuswb %xmm7, %xmm7
959 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255]
960 ; SSE-NEXT: pand %xmm1, %xmm7
961 ; SSE-NEXT: movdqa %xmm1, %xmm2
962 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [0,65535,65535,65535,65535,65535,65535,65535]
963 ; SSE-NEXT: movdqa %xmm6, %xmm5
964 ; SSE-NEXT: pand %xmm9, %xmm5
965 ; SSE-NEXT: pandn %xmm0, %xmm9
966 ; SSE-NEXT: por %xmm5, %xmm9
967 ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
968 ; SSE-NEXT: movdqa %xmm6, %xmm8
969 ; SSE-NEXT: movss {{.*#+}} xmm8 = xmm0[0],xmm8[1,2,3]
970 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535]
971 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
972 ; SSE-NEXT: movdqa %xmm11, %xmm10
973 ; SSE-NEXT: movdqa %xmm11, %xmm1
974 ; SSE-NEXT: pand %xmm5, %xmm10
975 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,0,65535,65535,65535,65535,65535]
976 ; SSE-NEXT: movdqa %xmm6, %xmm4
977 ; SSE-NEXT: pand %xmm12, %xmm4
978 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
979 ; SSE-NEXT: pandn %xmm0, %xmm12
980 ; SSE-NEXT: movaps %xmm0, %xmm14
981 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm6[0,0]
982 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm6[2,3]
983 ; SSE-NEXT: pand %xmm5, %xmm0
984 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
985 ; SSE-NEXT: pandn %xmm6, %xmm5
986 ; SSE-NEXT: movdqa %xmm6, %xmm15
987 ; SSE-NEXT: pxor %xmm0, %xmm0
988 ; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15]
989 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3]
990 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,6,7]
991 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
992 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,4,4,5,6]
993 ; SSE-NEXT: packuswb %xmm9, %xmm9
994 ; SSE-NEXT: movdqa %xmm2, %xmm11
995 ; SSE-NEXT: movdqa %xmm2, %xmm13
996 ; SSE-NEXT: pandn %xmm9, %xmm13
997 ; SSE-NEXT: por %xmm7, %xmm13
998 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535]
999 ; SSE-NEXT: movdqa %xmm7, %xmm9
1000 ; SSE-NEXT: movdqa %xmm1, %xmm4
1001 ; SSE-NEXT: pandn %xmm1, %xmm9
1002 ; SSE-NEXT: movdqa %xmm3, %xmm2
1003 ; SSE-NEXT: pand %xmm7, %xmm3
1004 ; SSE-NEXT: por %xmm9, %xmm3
1005 ; SSE-NEXT: movdqa %xmm3, %xmm9
1006 ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
1007 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,0,65535]
1008 ; SSE-NEXT: movdqa %xmm0, %xmm1
1009 ; SSE-NEXT: pandn %xmm9, %xmm1
1010 ; SSE-NEXT: pxor %xmm6, %xmm6
1011 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15]
1012 ; SSE-NEXT: pand %xmm0, %xmm3
1013 ; SSE-NEXT: por %xmm1, %xmm3
1014 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
1015 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
1016 ; SSE-NEXT: movdqa %xmm8, %xmm9
1017 ; SSE-NEXT: pand %xmm1, %xmm9
1018 ; SSE-NEXT: pandn %xmm15, %xmm1
1019 ; SSE-NEXT: por %xmm9, %xmm1
1020 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1021 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
1022 ; SSE-NEXT: packuswb %xmm1, %xmm1
1023 ; SSE-NEXT: movdqa %xmm11, %xmm9
1024 ; SSE-NEXT: pandn %xmm1, %xmm9
1025 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3]
1026 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
1027 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
1028 ; SSE-NEXT: packuswb %xmm1, %xmm1
1029 ; SSE-NEXT: pand %xmm11, %xmm1
1030 ; SSE-NEXT: por %xmm1, %xmm9
1031 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1032 ; SSE-NEXT: pandn %xmm2, %xmm1
1033 ; SSE-NEXT: por %xmm1, %xmm10
1034 ; SSE-NEXT: movdqa %xmm10, %xmm1
1035 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
1036 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
1037 ; SSE-NEXT: pand %xmm0, %xmm10
1038 ; SSE-NEXT: pandn %xmm1, %xmm0
1039 ; SSE-NEXT: por %xmm10, %xmm0
1040 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1041 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
1042 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,3]
1043 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
1044 ; SSE-NEXT: packuswb %xmm0, %xmm0
1045 ; SSE-NEXT: pand %xmm11, %xmm0
1046 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535]
1047 ; SSE-NEXT: pand %xmm1, %xmm8
1048 ; SSE-NEXT: pandn %xmm15, %xmm1
1049 ; SSE-NEXT: por %xmm8, %xmm1
1050 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1051 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
1052 ; SSE-NEXT: packuswb %xmm1, %xmm1
1053 ; SSE-NEXT: movdqa %xmm11, %xmm8
1054 ; SSE-NEXT: pandn %xmm1, %xmm8
1055 ; SSE-NEXT: por %xmm0, %xmm8
1056 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,0,65535,65535]
1057 ; SSE-NEXT: movdqa %xmm4, %xmm1
1058 ; SSE-NEXT: pand %xmm0, %xmm1
1059 ; SSE-NEXT: pandn %xmm2, %xmm0
1060 ; SSE-NEXT: movdqa %xmm2, %xmm10
1061 ; SSE-NEXT: por %xmm1, %xmm0
1062 ; SSE-NEXT: movdqa %xmm0, %xmm1
1063 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
1064 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
1065 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,65535,65535,65535,65535]
1066 ; SSE-NEXT: pand %xmm3, %xmm0
1067 ; SSE-NEXT: pandn %xmm1, %xmm3
1068 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1069 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi
1070 ; SSE-NEXT: por %xmm0, %xmm3
1071 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,2,1,0,4,5,6,7]
1072 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
1073 ; SSE-NEXT: packuswb %xmm0, %xmm0
1074 ; SSE-NEXT: pand %xmm11, %xmm0
1075 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
1076 ; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
1077 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,5,6,7]
1078 ; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
1079 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[0,3,2,3,4,5,6,7]
1080 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
1081 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6]
1082 ; SSE-NEXT: packuswb %xmm3, %xmm3
1083 ; SSE-NEXT: pandn %xmm3, %xmm11
1084 ; SSE-NEXT: por %xmm0, %xmm11
1085 ; SSE-NEXT: movdqa %xmm11, %xmm6
1086 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535]
1087 ; SSE-NEXT: movdqa %xmm4, %xmm2
1088 ; SSE-NEXT: movdqa %xmm4, %xmm3
1089 ; SSE-NEXT: pand %xmm0, %xmm3
1090 ; SSE-NEXT: movdqa %xmm10, %xmm11
1091 ; SSE-NEXT: pandn %xmm10, %xmm0
1092 ; SSE-NEXT: por %xmm3, %xmm0
1093 ; SSE-NEXT: movdqa %xmm0, %xmm3
1094 ; SSE-NEXT: pxor %xmm4, %xmm4
1095 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
1096 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
1097 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1098 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
1099 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1100 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1101 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
1102 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,5]
1103 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,0,65535]
1104 ; SSE-NEXT: pand %xmm3, %xmm1
1105 ; SSE-NEXT: pandn %xmm15, %xmm3
1106 ; SSE-NEXT: por %xmm1, %xmm3
1107 ; SSE-NEXT: packuswb %xmm3, %xmm0
1108 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,3,2,3]
1109 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,3,2,3]
1110 ; SSE-NEXT: movdqa %xmm11, %xmm3
1111 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
1112 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1113 ; SSE-NEXT: movdqa %xmm1, %xmm0
1114 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
1115 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
1116 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1117 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1118 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
1119 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
1120 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1121 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0]
1122 ; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3],xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7]
1123 ; SSE-NEXT: pand %xmm0, %xmm14
1124 ; SSE-NEXT: pandn %xmm15, %xmm0
1125 ; SSE-NEXT: por %xmm14, %xmm0
1126 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
1127 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
1128 ; SSE-NEXT: packuswb %xmm0, %xmm1
1129 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,3,2,3]
1130 ; SSE-NEXT: movdqa %xmm2, %xmm0
1131 ; SSE-NEXT: pand %xmm7, %xmm0
1132 ; SSE-NEXT: pandn %xmm3, %xmm7
1133 ; SSE-NEXT: por %xmm0, %xmm7
1134 ; SSE-NEXT: movdqa %xmm7, %xmm0
1135 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1136 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
1137 ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
1138 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1]
1139 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7]
1140 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1141 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
1142 ; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
1143 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535]
1144 ; SSE-NEXT: pand %xmm1, %xmm5
1145 ; SSE-NEXT: pandn %xmm15, %xmm1
1146 ; SSE-NEXT: por %xmm5, %xmm1
1147 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
1148 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
1149 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
1150 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
1151 ; SSE-NEXT: packuswb %xmm1, %xmm0
1152 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
1153 ; SSE-NEXT: movq %xmm13, (%rsi)
1154 ; SSE-NEXT: movq %xmm9, (%rdx)
1155 ; SSE-NEXT: movq %xmm8, (%rcx)
1156 ; SSE-NEXT: movq %xmm6, (%r8)
1157 ; SSE-NEXT: movq %xmm10, (%r9)
1158 ; SSE-NEXT: movq %xmm11, (%rdi)
1159 ; SSE-NEXT: movq %xmm0, (%rax)
1160 ; SSE-NEXT: retq
1162 ; AVX-LABEL: load_i8_stride7_vf8:
1163 ; AVX: # %bb.0:
1164 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
1165 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
1166 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1167 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
1168 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
1169 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm3
1170 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[5,12,u,u,u,u,u,u,u,u,u,u,u]
1171 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
1172 ; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4
1173 ; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2]
1174 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,7,14,0,0,7,14,0,0,7,14,0,0,7,14]
1175 ; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm5
1176 ; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [255,255,255,255,255,0,0,0,255,255,255,255,255,0,0,0]
1177 ; AVX-NEXT: # xmm7 = mem[0,0]
1178 ; AVX-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4
1179 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u,u,u,u,u]
1180 ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
1181 ; AVX-NEXT: vpor %xmm5, %xmm8, %xmm5
1182 ; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm2[4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3]
1183 ; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm8
1184 ; AVX-NEXT: vpblendvb %xmm7, %xmm5, %xmm8, %xmm5
1185 ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
1186 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
1187 ; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8
1188 ; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4]
1189 ; AVX-NEXT: vpshufb %xmm6, %xmm9, %xmm9
1190 ; AVX-NEXT: vpblendvb %xmm7, %xmm8, %xmm9, %xmm8
1191 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
1192 ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
1193 ; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9
1194 ; AVX-NEXT: vpalignr {{.*#+}} xmm10 = xmm2[6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5]
1195 ; AVX-NEXT: vpshufb %xmm6, %xmm10, %xmm6
1196 ; AVX-NEXT: vpblendvb %xmm7, %xmm9, %xmm6, %xmm6
1197 ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u]
1198 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,0,7,14],zero,xmm2[u,u,u,u,u,u,u,u]
1199 ; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7
1200 ; AVX-NEXT: vmovd {{.*#+}} xmm9 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
1201 ; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm10
1202 ; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
1203 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
1204 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3],xmm10[4,5,6,7]
1205 ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,u,u,u,u,u,u,u,u]
1206 ; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,1,8,15],zero,xmm2[u,u,u,u,u,u,u,u]
1207 ; AVX-NEXT: vpor %xmm10, %xmm11, %xmm10
1208 ; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
1209 ; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
1210 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
1211 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7]
1212 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1213 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1214 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1215 ; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm1
1216 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
1217 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1218 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
1219 ; AVX-NEXT: vmovq %xmm4, (%rsi)
1220 ; AVX-NEXT: vmovq %xmm5, (%rdx)
1221 ; AVX-NEXT: vmovq %xmm8, (%rcx)
1222 ; AVX-NEXT: vmovq %xmm6, (%r8)
1223 ; AVX-NEXT: vmovq %xmm7, (%r9)
1224 ; AVX-NEXT: vmovq %xmm10, (%r10)
1225 ; AVX-NEXT: vmovq %xmm0, (%rax)
1226 ; AVX-NEXT: retq
1228 ; AVX2-LABEL: load_i8_stride7_vf8:
1229 ; AVX2: # %bb.0:
1230 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
1231 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
1232 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
1233 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
1234 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
1235 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
1236 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
1237 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
1238 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
1239 ; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
1240 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
1241 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
1242 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
1243 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
1244 ; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3
1245 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
1246 ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4
1247 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
1248 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
1249 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
1250 ; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4
1251 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
1252 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
1253 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
1254 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
1255 ; AVX2-NEXT: vpor %xmm6, %xmm5, %xmm5
1256 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0]
1257 ; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6
1258 ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
1259 ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
1260 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
1261 ; AVX2-NEXT: vpor %xmm7, %xmm6, %xmm6
1262 ; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
1263 ; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8
1264 ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
1265 ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
1266 ; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7
1267 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0]
1268 ; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0
1269 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
1270 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1271 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1272 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1273 ; AVX2-NEXT: vmovq %xmm2, (%rsi)
1274 ; AVX2-NEXT: vmovq %xmm3, (%rdx)
1275 ; AVX2-NEXT: vmovq %xmm4, (%rcx)
1276 ; AVX2-NEXT: vmovq %xmm5, (%r8)
1277 ; AVX2-NEXT: vmovq %xmm6, (%r9)
1278 ; AVX2-NEXT: vmovq %xmm7, (%r10)
1279 ; AVX2-NEXT: vmovq %xmm0, (%rax)
1280 ; AVX2-NEXT: vzeroupper
1281 ; AVX2-NEXT: retq
1283 ; AVX2-FP-LABEL: load_i8_stride7_vf8:
1284 ; AVX2-FP: # %bb.0:
1285 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1286 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1287 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
1288 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1
1289 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
1290 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
1291 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
1292 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
1293 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
1294 ; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2
1295 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
1296 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
1297 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
1298 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
1299 ; AVX2-FP-NEXT: vpor %xmm4, %xmm3, %xmm3
1300 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
1301 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4
1302 ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5
1303 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
1304 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
1305 ; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4
1306 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
1307 ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6
1308 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
1309 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
1310 ; AVX2-FP-NEXT: vpor %xmm6, %xmm5, %xmm5
1311 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0]
1312 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6
1313 ; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7
1314 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
1315 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
1316 ; AVX2-FP-NEXT: vpor %xmm7, %xmm6, %xmm6
1317 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
1318 ; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8
1319 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
1320 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
1321 ; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7
1322 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0]
1323 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0
1324 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
1325 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1326 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1327 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1328 ; AVX2-FP-NEXT: vmovq %xmm2, (%rsi)
1329 ; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
1330 ; AVX2-FP-NEXT: vmovq %xmm4, (%rcx)
1331 ; AVX2-FP-NEXT: vmovq %xmm5, (%r8)
1332 ; AVX2-FP-NEXT: vmovq %xmm6, (%r9)
1333 ; AVX2-FP-NEXT: vmovq %xmm7, (%r10)
1334 ; AVX2-FP-NEXT: vmovq %xmm0, (%rax)
1335 ; AVX2-FP-NEXT: vzeroupper
1336 ; AVX2-FP-NEXT: retq
1338 ; AVX2-FCP-LABEL: load_i8_stride7_vf8:
1339 ; AVX2-FCP: # %bb.0:
1340 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1341 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1342 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
1343 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
1344 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
1345 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
1346 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
1347 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
1348 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
1349 ; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
1350 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
1351 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
1352 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
1353 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
1354 ; AVX2-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
1355 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
1356 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4
1357 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
1358 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
1359 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
1360 ; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
1361 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
1362 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
1363 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
1364 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
1365 ; AVX2-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
1366 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0,0]
1367 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6
1368 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
1369 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
1370 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
1371 ; AVX2-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
1372 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
1373 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
1374 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
1375 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
1376 ; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
1377 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,0]
1378 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0
1379 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
1380 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1381 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1382 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1383 ; AVX2-FCP-NEXT: vmovq %xmm2, (%rsi)
1384 ; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
1385 ; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx)
1386 ; AVX2-FCP-NEXT: vmovq %xmm5, (%r8)
1387 ; AVX2-FCP-NEXT: vmovq %xmm6, (%r9)
1388 ; AVX2-FCP-NEXT: vmovq %xmm7, (%r10)
1389 ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax)
1390 ; AVX2-FCP-NEXT: vzeroupper
1391 ; AVX2-FCP-NEXT: retq
1393 ; AVX512-LABEL: load_i8_stride7_vf8:
1394 ; AVX512: # %bb.0:
1395 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
1396 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
1397 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
1398 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
1399 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
1400 ; AVX512-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm2
1401 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
1402 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
1403 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
1404 ; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
1405 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
1406 ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
1407 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
1408 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
1409 ; AVX512-NEXT: vpor %xmm4, %xmm3, %xmm3
1410 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535]
1411 ; AVX512-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm4
1412 ; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5
1413 ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
1414 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
1415 ; AVX512-NEXT: vpor %xmm5, %xmm4, %xmm4
1416 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
1417 ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
1418 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
1419 ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
1420 ; AVX512-NEXT: vpor %xmm6, %xmm5, %xmm5
1421 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
1422 ; AVX512-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm6
1423 ; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7
1424 ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
1425 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
1426 ; AVX512-NEXT: vpor %xmm7, %xmm6, %xmm6
1427 ; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
1428 ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8
1429 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
1430 ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
1431 ; AVX512-NEXT: vpor %xmm7, %xmm8, %xmm7
1432 ; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1433 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
1434 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1435 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1436 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1437 ; AVX512-NEXT: vmovq %xmm2, (%rsi)
1438 ; AVX512-NEXT: vmovq %xmm3, (%rdx)
1439 ; AVX512-NEXT: vmovq %xmm4, (%rcx)
1440 ; AVX512-NEXT: vmovq %xmm5, (%r8)
1441 ; AVX512-NEXT: vmovq %xmm6, (%r9)
1442 ; AVX512-NEXT: vmovq %xmm7, (%r10)
1443 ; AVX512-NEXT: vmovq %xmm0, (%rax)
1444 ; AVX512-NEXT: vzeroupper
1445 ; AVX512-NEXT: retq
1447 ; AVX512-FCP-LABEL: load_i8_stride7_vf8:
1448 ; AVX512-FCP: # %bb.0:
1449 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1450 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1451 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0
1452 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
1453 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
1454 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm2
1455 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
1456 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
1457 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
1458 ; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
1459 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
1460 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
1461 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
1462 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
1463 ; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
1464 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535]
1465 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm4
1466 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
1467 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
1468 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
1469 ; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
1470 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
1471 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
1472 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
1473 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
1474 ; AVX512-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
1475 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
1476 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm6
1477 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
1478 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
1479 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
1480 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
1481 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
1482 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
1483 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
1484 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
1485 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
1486 ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1487 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
1488 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1489 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1490 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1491 ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
1492 ; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
1493 ; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx)
1494 ; AVX512-FCP-NEXT: vmovq %xmm5, (%r8)
1495 ; AVX512-FCP-NEXT: vmovq %xmm6, (%r9)
1496 ; AVX512-FCP-NEXT: vmovq %xmm7, (%r10)
1497 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rax)
1498 ; AVX512-FCP-NEXT: vzeroupper
1499 ; AVX512-FCP-NEXT: retq
1501 ; AVX512DQ-LABEL: load_i8_stride7_vf8:
1502 ; AVX512DQ: # %bb.0:
1503 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
1504 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
1505 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
1506 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
1507 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
1508 ; AVX512DQ-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm2
1509 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
1510 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
1511 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
1512 ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
1513 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
1514 ; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4
1515 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
1516 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
1517 ; AVX512DQ-NEXT: vpor %xmm4, %xmm3, %xmm3
1518 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535]
1519 ; AVX512DQ-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm4
1520 ; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5
1521 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
1522 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
1523 ; AVX512DQ-NEXT: vpor %xmm5, %xmm4, %xmm4
1524 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
1525 ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6
1526 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
1527 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
1528 ; AVX512DQ-NEXT: vpor %xmm6, %xmm5, %xmm5
1529 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
1530 ; AVX512DQ-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm6
1531 ; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7
1532 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
1533 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
1534 ; AVX512DQ-NEXT: vpor %xmm7, %xmm6, %xmm6
1535 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
1536 ; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8
1537 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
1538 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
1539 ; AVX512DQ-NEXT: vpor %xmm7, %xmm8, %xmm7
1540 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1541 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
1542 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1543 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1544 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1545 ; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
1546 ; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
1547 ; AVX512DQ-NEXT: vmovq %xmm4, (%rcx)
1548 ; AVX512DQ-NEXT: vmovq %xmm5, (%r8)
1549 ; AVX512DQ-NEXT: vmovq %xmm6, (%r9)
1550 ; AVX512DQ-NEXT: vmovq %xmm7, (%r10)
1551 ; AVX512DQ-NEXT: vmovq %xmm0, (%rax)
1552 ; AVX512DQ-NEXT: vzeroupper
1553 ; AVX512DQ-NEXT: retq
1555 ; AVX512DQ-FCP-LABEL: load_i8_stride7_vf8:
1556 ; AVX512DQ-FCP: # %bb.0:
1557 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1558 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1559 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0
1560 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
1561 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
1562 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm2
1563 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
1564 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
1565 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
1566 ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
1567 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5],ymm0[6,7,8],ymm1[9,10],ymm0[11,12],ymm1[13],ymm0[14,15]
1568 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
1569 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
1570 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
1571 ; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
1572 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,65535,65535,65535]
1573 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm1, %ymm0, %ymm4
1574 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
1575 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
1576 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
1577 ; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
1578 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7]
1579 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
1580 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
1581 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
1582 ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
1583 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
1584 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm6
1585 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
1586 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
1587 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
1588 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
1589 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7,8],ymm0[9,10],ymm1[11,12],ymm0[13,14],ymm1[15]
1590 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
1591 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
1592 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
1593 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
1594 ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
1595 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
1596 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1597 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1598 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1599 ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
1600 ; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
1601 ; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx)
1602 ; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8)
1603 ; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r9)
1604 ; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10)
1605 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax)
1606 ; AVX512DQ-FCP-NEXT: vzeroupper
1607 ; AVX512DQ-FCP-NEXT: retq
1609 ; AVX512BW-LABEL: load_i8_stride7_vf8:
1610 ; AVX512BW: # %bb.0:
1611 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
1612 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
1613 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
1614 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0
1615 ; AVX512BW-NEXT: movw $290, %di # imm = 0x122
1616 ; AVX512BW-NEXT: kmovd %edi, %k1
1617 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1}
1618 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
1619 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
1620 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
1621 ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
1622 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15]
1623 ; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4
1624 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
1625 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
1626 ; AVX512BW-NEXT: vpor %xmm4, %xmm3, %xmm3
1627 ; AVX512BW-NEXT: movw $580, %di # imm = 0x244
1628 ; AVX512BW-NEXT: kmovd %edi, %k1
1629 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1}
1630 ; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm5
1631 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
1632 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
1633 ; AVX512BW-NEXT: vpor %xmm5, %xmm4, %xmm4
1634 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7]
1635 ; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6
1636 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
1637 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
1638 ; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5
1639 ; AVX512BW-NEXT: movw $4644, %di # imm = 0x1224
1640 ; AVX512BW-NEXT: kmovd %edi, %k1
1641 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1}
1642 ; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7
1643 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
1644 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
1645 ; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6
1646 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15]
1647 ; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm8
1648 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
1649 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
1650 ; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7
1651 ; AVX512BW-NEXT: movw $9288, %di # imm = 0x2448
1652 ; AVX512BW-NEXT: kmovd %edi, %k1
1653 ; AVX512BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
1654 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
1655 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1656 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1657 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1658 ; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
1659 ; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
1660 ; AVX512BW-NEXT: vmovq %xmm4, (%rcx)
1661 ; AVX512BW-NEXT: vmovq %xmm5, (%r8)
1662 ; AVX512BW-NEXT: vmovq %xmm6, (%r9)
1663 ; AVX512BW-NEXT: vmovq %xmm7, (%r10)
1664 ; AVX512BW-NEXT: vmovq %xmm0, (%rax)
1665 ; AVX512BW-NEXT: vzeroupper
1666 ; AVX512BW-NEXT: retq
1668 ; AVX512BW-FCP-LABEL: load_i8_stride7_vf8:
1669 ; AVX512BW-FCP: # %bb.0:
1670 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1671 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1672 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
1673 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm0
1674 ; AVX512BW-FCP-NEXT: movw $290, %di # imm = 0x122
1675 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1
1676 ; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1}
1677 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
1678 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
1679 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
1680 ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
1681 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15]
1682 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
1683 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
1684 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
1685 ; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
1686 ; AVX512BW-FCP-NEXT: movw $580, %di # imm = 0x244
1687 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1
1688 ; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1}
1689 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
1690 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
1691 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
1692 ; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
1693 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7]
1694 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
1695 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
1696 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
1697 ; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
1698 ; AVX512BW-FCP-NEXT: movw $4644, %di # imm = 0x1224
1699 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1
1700 ; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1}
1701 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
1702 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
1703 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
1704 ; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
1705 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15]
1706 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
1707 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
1708 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
1709 ; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
1710 ; AVX512BW-FCP-NEXT: movw $9288, %di # imm = 0x2448
1711 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1
1712 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
1713 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
1714 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1715 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1716 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1717 ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
1718 ; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
1719 ; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx)
1720 ; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8)
1721 ; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r9)
1722 ; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10)
1723 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rax)
1724 ; AVX512BW-FCP-NEXT: vzeroupper
1725 ; AVX512BW-FCP-NEXT: retq
1727 ; AVX512DQ-BW-LABEL: load_i8_stride7_vf8:
1728 ; AVX512DQ-BW: # %bb.0:
1729 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
1730 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
1731 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
1732 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm0
1733 ; AVX512DQ-BW-NEXT: movw $290, %di # imm = 0x122
1734 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1
1735 ; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1}
1736 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
1737 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
1738 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
1739 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
1740 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15]
1741 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm4
1742 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
1743 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
1744 ; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm3, %xmm3
1745 ; AVX512DQ-BW-NEXT: movw $580, %di # imm = 0x244
1746 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1
1747 ; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1}
1748 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm5
1749 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
1750 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
1751 ; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm4, %xmm4
1752 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7]
1753 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm6
1754 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
1755 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
1756 ; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5
1757 ; AVX512DQ-BW-NEXT: movw $4644, %di # imm = 0x1224
1758 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1
1759 ; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1}
1760 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7
1761 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
1762 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
1763 ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6
1764 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15]
1765 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm8
1766 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
1767 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
1768 ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7
1769 ; AVX512DQ-BW-NEXT: movw $9288, %di # imm = 0x2448
1770 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1
1771 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
1772 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm1
1773 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1774 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1775 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1776 ; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
1777 ; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
1778 ; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx)
1779 ; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r8)
1780 ; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r9)
1781 ; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10)
1782 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rax)
1783 ; AVX512DQ-BW-NEXT: vzeroupper
1784 ; AVX512DQ-BW-NEXT: retq
1786 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf8:
1787 ; AVX512DQ-BW-FCP: # %bb.0:
1788 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1789 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1790 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
1791 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm0
1792 ; AVX512DQ-BW-FCP-NEXT: movw $290, %di # imm = 0x122
1793 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
1794 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm2 {%k1}
1795 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
1796 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u]
1797 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,xmm2[u,u,u,u,u,u,u,u]
1798 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
1799 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5],ymm1[6,7,8],ymm0[9,10],ymm1[11,12],ymm0[13],ymm1[14,15]
1800 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
1801 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u]
1802 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u]
1803 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
1804 ; AVX512DQ-BW-FCP-NEXT: movw $580, %di # imm = 0x244
1805 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
1806 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k1}
1807 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
1808 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u]
1809 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,9],zero,zero,zero,xmm4[5,12],zero,xmm4[u,u,u,u,u,u,u,u]
1810 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
1811 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7]
1812 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
1813 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u]
1814 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u]
1815 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
1816 ; AVX512DQ-BW-FCP-NEXT: movw $4644, %di # imm = 0x1224
1817 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
1818 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm0, %ymm6 {%k1}
1819 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
1820 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u]
1821 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,xmm6[u,u,u,u,u,u,u,u]
1822 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
1823 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3,4],ymm1[5,6],ymm0[7,8],ymm1[9,10],ymm0[11,12],ymm1[13,14],ymm0[15]
1824 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
1825 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u]
1826 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u]
1827 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
1828 ; AVX512DQ-BW-FCP-NEXT: movw $9288, %di # imm = 0x2448
1829 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
1830 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
1831 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
1832 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u]
1833 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,13,2,9,u,u,u,u,u,u,u,u,u,u,u,u]
1834 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1835 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
1836 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
1837 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx)
1838 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8)
1839 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r9)
1840 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10)
1841 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rax)
1842 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
1843 ; AVX512DQ-BW-FCP-NEXT: retq
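; The IR below is the vf8 pattern itself: one 56-byte wide load followed by seven
; shufflevectors, where %strided.vecK gathers bytes k, k+7, k+14, ..., k+49 (every
; 7th byte starting at offset k) and each 8-byte result is stored to its own output.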
1844 %wide.vec = load <56 x i8>, ptr %in.vec, align 64
1845 %strided.vec0 = shufflevector <56 x i8> %wide.vec, <56 x i8> poison, <8 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49>
1846 %strided.vec1 = shufflevector <56 x i8> %wide.vec, <56 x i8> poison, <8 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50>
1847 %strided.vec2 = shufflevector <56 x i8> %wide.vec, <56 x i8> poison, <8 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51>
1848 %strided.vec3 = shufflevector <56 x i8> %wide.vec, <56 x i8> poison, <8 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52>
1849 %strided.vec4 = shufflevector <56 x i8> %wide.vec, <56 x i8> poison, <8 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53>
1850 %strided.vec5 = shufflevector <56 x i8> %wide.vec, <56 x i8> poison, <8 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54>
1851 %strided.vec6 = shufflevector <56 x i8> %wide.vec, <56 x i8> poison, <8 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55>
1852 store <8 x i8> %strided.vec0, ptr %out.vec0, align 64
1853 store <8 x i8> %strided.vec1, ptr %out.vec1, align 64
1854 store <8 x i8> %strided.vec2, ptr %out.vec2, align 64
1855 store <8 x i8> %strided.vec3, ptr %out.vec3, align 64
1856 store <8 x i8> %strided.vec4, ptr %out.vec4, align 64
1857 store <8 x i8> %strided.vec5, ptr %out.vec5, align 64
1858 store <8 x i8> %strided.vec6, ptr %out.vec6, align 64
1859 ret void
1860 }
1862 define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
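; For vf16 the SSE block below has no pshufb available, so it deinterleaves the seven
; streams with pand/pandn/por mask merges, punpck* widenings against a zeroed register,
; pshuflw/pshufhw/pshufd reshuffles and packuswb narrowing, spilling intermediates to
; the stack; the AVX and later blocks do the same extraction with pshufb byte shuffles
; and vpblendvb/vpblendw merges instead.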
1863 ; SSE-LABEL: load_i8_stride7_vf16:
1864 ; SSE: # %bb.0:
1865 ; SSE-NEXT: subq $168, %rsp
1866 ; SSE-NEXT: movdqa 96(%rdi), %xmm15
1867 ; SSE-NEXT: movdqa 80(%rdi), %xmm4
1868 ; SSE-NEXT: movdqa 64(%rdi), %xmm7
1869 ; SSE-NEXT: movdqa (%rdi), %xmm6
1870 ; SSE-NEXT: movdqa 16(%rdi), %xmm3
1871 ; SSE-NEXT: movdqa 32(%rdi), %xmm1
1872 ; SSE-NEXT: movdqa 48(%rdi), %xmm8
1873 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535]
1874 ; SSE-NEXT: movdqa %xmm2, %xmm0
1875 ; SSE-NEXT: pandn %xmm1, %xmm0
1876 ; SSE-NEXT: movdqa %xmm1, %xmm12
1877 ; SSE-NEXT: movdqa %xmm8, %xmm1
1878 ; SSE-NEXT: pand %xmm2, %xmm1
1879 ; SSE-NEXT: por %xmm0, %xmm1
1880 ; SSE-NEXT: pxor %xmm13, %xmm13
1881 ; SSE-NEXT: movdqa %xmm1, %xmm0
1882 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15]
1883 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7]
1884 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1885 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
1886 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1887 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6]
1888 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
1889 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
1890 ; SSE-NEXT: packuswb %xmm0, %xmm1
1891 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255]
1892 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,65535,0,65535]
1893 ; SSE-NEXT: movdqa %xmm10, %xmm0
1894 ; SSE-NEXT: pandn %xmm3, %xmm0
1895 ; SSE-NEXT: movdqa %xmm3, %xmm9
1896 ; SSE-NEXT: movdqa %xmm6, %xmm3
1897 ; SSE-NEXT: movdqa %xmm6, %xmm11
1898 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1899 ; SSE-NEXT: pand %xmm10, %xmm3
1900 ; SSE-NEXT: por %xmm0, %xmm3
1901 ; SSE-NEXT: movdqa %xmm3, %xmm0
1902 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15]
1903 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,0,65535,0,65535]
1904 ; SSE-NEXT: movdqa %xmm14, %xmm6
1905 ; SSE-NEXT: pandn %xmm0, %xmm6
1906 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3],xmm3[4],xmm13[4],xmm3[5],xmm13[5],xmm3[6],xmm13[6],xmm3[7],xmm13[7]
1907 ; SSE-NEXT: pand %xmm14, %xmm3
1908 ; SSE-NEXT: por %xmm6, %xmm3
1909 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,1,3]
1910 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
1911 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,1]
1912 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
1913 ; SSE-NEXT: packuswb %xmm0, %xmm0
1914 ; SSE-NEXT: pand %xmm2, %xmm0
1915 ; SSE-NEXT: pandn %xmm1, %xmm2
1916 ; SSE-NEXT: por %xmm2, %xmm0
1917 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,0,65535]
1918 ; SSE-NEXT: movdqa %xmm3, %xmm1
1919 ; SSE-NEXT: pandn %xmm7, %xmm1
1920 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1921 ; SSE-NEXT: movdqa %xmm4, %xmm2
1922 ; SSE-NEXT: movdqa %xmm4, %xmm5
1923 ; SSE-NEXT: pand %xmm3, %xmm2
1924 ; SSE-NEXT: movdqa %xmm3, %xmm13
1925 ; SSE-NEXT: por %xmm1, %xmm2
1926 ; SSE-NEXT: movdqa %xmm2, %xmm1
1927 ; SSE-NEXT: pxor %xmm6, %xmm6
1928 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
1929 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
1930 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
1931 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
1932 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
1933 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1934 ; SSE-NEXT: movdqa %xmm15, %xmm2
1935 ; SSE-NEXT: movdqa %xmm15, %xmm3
1936 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15]
1937 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1938 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
1939 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1940 ; SSE-NEXT: pxor %xmm15, %xmm15
1941 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1942 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
1943 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
1944 ; SSE-NEXT: packuswb %xmm2, %xmm2
1945 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,65535,0]
1946 ; SSE-NEXT: movdqa %xmm4, %xmm3
1947 ; SSE-NEXT: pandn %xmm2, %xmm3
1948 ; SSE-NEXT: packuswb %xmm1, %xmm1
1949 ; SSE-NEXT: pand %xmm4, %xmm1
1950 ; SSE-NEXT: por %xmm1, %xmm3
1951 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,0]
1952 ; SSE-NEXT: pand %xmm1, %xmm0
1953 ; SSE-NEXT: pandn %xmm3, %xmm1
1954 ; SSE-NEXT: por %xmm0, %xmm1
1955 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1956 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535]
1957 ; SSE-NEXT: movdqa %xmm2, %xmm0
1958 ; SSE-NEXT: pandn %xmm12, %xmm0
1959 ; SSE-NEXT: movdqa %xmm8, %xmm1
1960 ; SSE-NEXT: pand %xmm2, %xmm1
1961 ; SSE-NEXT: por %xmm0, %xmm1
1962 ; SSE-NEXT: movdqa %xmm1, %xmm0
1963 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15]
1964 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
1965 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
1966 ; SSE-NEXT: pand %xmm2, %xmm1
1967 ; SSE-NEXT: pandn %xmm0, %xmm2
1968 ; SSE-NEXT: por %xmm1, %xmm2
1969 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,2,1]
1970 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
1971 ; SSE-NEXT: psrld $16, %xmm0
1972 ; SSE-NEXT: packuswb %xmm0, %xmm1
1973 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255]
1974 ; SSE-NEXT: movdqa %xmm4, %xmm0
1975 ; SSE-NEXT: pandn %xmm1, %xmm0
1976 ; SSE-NEXT: movdqa %xmm13, %xmm1
1977 ; SSE-NEXT: pandn %xmm9, %xmm1
1978 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1979 ; SSE-NEXT: movdqa %xmm11, %xmm2
1980 ; SSE-NEXT: pand %xmm13, %xmm2
1981 ; SSE-NEXT: movdqa %xmm13, %xmm11
1982 ; SSE-NEXT: por %xmm1, %xmm2
1983 ; SSE-NEXT: movdqa %xmm2, %xmm1
1984 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
1985 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,0,65535]
1986 ; SSE-NEXT: movdqa %xmm6, %xmm3
1987 ; SSE-NEXT: pandn %xmm1, %xmm3
1988 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15]
1989 ; SSE-NEXT: pand %xmm6, %xmm2
1990 ; SSE-NEXT: por %xmm3, %xmm2
1991 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3]
1992 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
1993 ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm1[0,1,2,3,5,5,5,5]
1994 ; SSE-NEXT: packuswb %xmm13, %xmm13
1995 ; SSE-NEXT: pand %xmm4, %xmm13
1996 ; SSE-NEXT: por %xmm0, %xmm13
1997 ; SSE-NEXT: movdqa %xmm10, %xmm0
1998 ; SSE-NEXT: pandn %xmm5, %xmm0
1999 ; SSE-NEXT: movdqa %xmm5, %xmm6
2000 ; SSE-NEXT: movdqa %xmm7, %xmm1
2001 ; SSE-NEXT: pand %xmm10, %xmm1
2002 ; SSE-NEXT: por %xmm0, %xmm1
2003 ; SSE-NEXT: movdqa %xmm1, %xmm0
2004 ; SSE-NEXT: pxor %xmm2, %xmm2
2005 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
2006 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2007 ; SSE-NEXT: pand %xmm14, %xmm1
2008 ; SSE-NEXT: pandn %xmm0, %xmm14
2009 ; SSE-NEXT: por %xmm1, %xmm14
2010 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2011 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2012 ; SSE-NEXT: movdqa %xmm12, %xmm0
2013 ; SSE-NEXT: pand %xmm10, %xmm0
2014 ; SSE-NEXT: pandn %xmm8, %xmm10
2015 ; SSE-NEXT: por %xmm0, %xmm10
2016 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,65535]
2017 ; SSE-NEXT: movdqa %xmm9, %xmm7
2018 ; SSE-NEXT: pand %xmm14, %xmm7
2019 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2020 ; SSE-NEXT: movdqa %xmm5, %xmm15
2021 ; SSE-NEXT: pand %xmm14, %xmm15
2022 ; SSE-NEXT: movdqa %xmm11, %xmm3
2023 ; SSE-NEXT: pandn %xmm8, %xmm3
2024 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2025 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm8[0,2,2,3]
2026 ; SSE-NEXT: movdqa %xmm8, %xmm4
2027 ; SSE-NEXT: pand %xmm14, %xmm8
2028 ; SSE-NEXT: movdqa %xmm14, %xmm9
2029 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2030 ; SSE-NEXT: pandn %xmm12, %xmm14
2031 ; SSE-NEXT: por %xmm8, %xmm14
2032 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2033 ; SSE-NEXT: movdqa %xmm0, %xmm5
2034 ; SSE-NEXT: pslld $16, %xmm5
2035 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2036 ; SSE-NEXT: movdqa %xmm8, %xmm3
2037 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
2038 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
2039 ; SSE-NEXT: movdqa %xmm8, %xmm1
2040 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2041 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2042 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2043 ; SSE-NEXT: pxor %xmm1, %xmm1
2044 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
2045 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,7]
2046 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
2047 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,6,4,6,5]
2048 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,65535,0,65535]
2049 ; SSE-NEXT: pand %xmm12, %xmm10
2050 ; SSE-NEXT: movdqa %xmm8, %xmm2
2051 ; SSE-NEXT: pand %xmm12, %xmm2
2052 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2053 ; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill
2054 ; SSE-NEXT: pandn %xmm0, %xmm12
2055 ; SSE-NEXT: movdqa %xmm0, %xmm2
2056 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2057 ; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3],xmm14[4],xmm1[4],xmm14[5],xmm1[5],xmm14[6],xmm1[6],xmm14[7],xmm1[7]
2058 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535]
2059 ; SSE-NEXT: pand %xmm0, %xmm14
2060 ; SSE-NEXT: pand %xmm0, %xmm2
2061 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2062 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2063 ; SSE-NEXT: pandn %xmm8, %xmm0
2064 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2065 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
2066 ; SSE-NEXT: packuswb %xmm8, %xmm5
2067 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0]
2068 ; SSE-NEXT: movdqa %xmm0, %xmm8
2069 ; SSE-NEXT: pandn %xmm5, %xmm8
2070 ; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2071 ; SSE-NEXT: # xmm5 = mem[0,3,2,3]
2072 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,3,2,4,5,6,7]
2073 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,6,7]
2074 ; SSE-NEXT: packuswb %xmm5, %xmm5
2075 ; SSE-NEXT: pand %xmm0, %xmm5
2076 ; SSE-NEXT: por %xmm5, %xmm8
2077 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
2078 ; SSE-NEXT: movdqa %xmm5, %xmm0
2079 ; SSE-NEXT: pandn %xmm8, %xmm0
2080 ; SSE-NEXT: pand %xmm5, %xmm13
2081 ; SSE-NEXT: por %xmm13, %xmm0
2082 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2083 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535]
2084 ; SSE-NEXT: movdqa %xmm2, %xmm8
2085 ; SSE-NEXT: pandn %xmm6, %xmm8
2086 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2087 ; SSE-NEXT: pand %xmm2, %xmm0
2088 ; SSE-NEXT: por %xmm8, %xmm0
2089 ; SSE-NEXT: movdqa %xmm0, %xmm8
2090 ; SSE-NEXT: pxor %xmm6, %xmm6
2091 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
2092 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,0,65535]
2093 ; SSE-NEXT: movdqa %xmm13, %xmm1
2094 ; SSE-NEXT: pandn %xmm8, %xmm1
2095 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
2096 ; SSE-NEXT: pxor %xmm8, %xmm8
2097 ; SSE-NEXT: pand %xmm13, %xmm0
2098 ; SSE-NEXT: por %xmm1, %xmm0
2099 ; SSE-NEXT: packuswb %xmm3, %xmm1
2100 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0]
2101 ; SSE-NEXT: movdqa %xmm6, %xmm3
2102 ; SSE-NEXT: pandn %xmm1, %xmm3
2103 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
2104 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
2105 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2106 ; SSE-NEXT: packuswb %xmm0, %xmm0
2107 ; SSE-NEXT: pand %xmm6, %xmm0
2108 ; SSE-NEXT: por %xmm0, %xmm3
2109 ; SSE-NEXT: movdqa %xmm5, %xmm0
2110 ; SSE-NEXT: pandn %xmm3, %xmm0
2111 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2112 ; SSE-NEXT: pandn %xmm6, %xmm9
2113 ; SSE-NEXT: por %xmm9, %xmm7
2114 ; SSE-NEXT: movdqa %xmm7, %xmm1
2115 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
2116 ; SSE-NEXT: movdqa %xmm13, %xmm3
2117 ; SSE-NEXT: pandn %xmm1, %xmm3
2118 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
2119 ; SSE-NEXT: pand %xmm13, %xmm7
2120 ; SSE-NEXT: por %xmm3, %xmm7
2121 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
2122 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,3,2,3]
2123 ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1]
2124 ; SSE-NEXT: movdqa %xmm11, %xmm1
2125 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
2126 ; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3],xmm11[4],xmm8[4],xmm11[5],xmm8[5],xmm11[6],xmm8[6],xmm11[7],xmm8[7]
2127 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,65535]
2128 ; SSE-NEXT: pand %xmm3, %xmm11
2129 ; SSE-NEXT: pandn %xmm1, %xmm3
2130 ; SSE-NEXT: por %xmm11, %xmm3
2131 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
2132 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
2133 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
2134 ; SSE-NEXT: packuswb %xmm1, %xmm3
2135 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255]
2136 ; SSE-NEXT: movdqa %xmm13, %xmm8
2137 ; SSE-NEXT: pandn %xmm3, %xmm8
2138 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,2,1,3]
2139 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
2140 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3]
2141 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
2142 ; SSE-NEXT: packuswb %xmm1, %xmm1
2143 ; SSE-NEXT: pand %xmm13, %xmm1
2144 ; SSE-NEXT: por %xmm1, %xmm8
2145 ; SSE-NEXT: pand %xmm5, %xmm8
2146 ; SSE-NEXT: por %xmm0, %xmm8
2147 ; SSE-NEXT: movdqa %xmm2, %xmm0
2148 ; SSE-NEXT: pandn %xmm9, %xmm0
2149 ; SSE-NEXT: pand %xmm2, %xmm4
2150 ; SSE-NEXT: por %xmm0, %xmm4
2151 ; SSE-NEXT: movdqa %xmm4, %xmm0
2152 ; SSE-NEXT: pxor %xmm1, %xmm1
2153 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
2154 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
2155 ; SSE-NEXT: pxor %xmm2, %xmm2
2156 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
2157 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7]
2158 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
2159 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,7,6]
2160 ; SSE-NEXT: psrlq $48, %xmm0
2161 ; SSE-NEXT: packuswb %xmm0, %xmm3
2162 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,0,65535,65535]
2163 ; SSE-NEXT: movdqa %xmm1, %xmm0
2164 ; SSE-NEXT: movdqa %xmm6, %xmm7
2165 ; SSE-NEXT: pandn %xmm6, %xmm0
2166 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
2167 ; SSE-NEXT: movdqa %xmm9, %xmm4
2168 ; SSE-NEXT: pand %xmm1, %xmm4
2169 ; SSE-NEXT: por %xmm0, %xmm4
2170 ; SSE-NEXT: movdqa %xmm4, %xmm0
2171 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2172 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,65535,65535,65535]
2173 ; SSE-NEXT: movdqa %xmm1, %xmm6
2174 ; SSE-NEXT: pandn %xmm0, %xmm6
2175 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
2176 ; SSE-NEXT: pand %xmm1, %xmm4
2177 ; SSE-NEXT: por %xmm6, %xmm4
2178 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[3,2,1,0,4,5,6,7]
2179 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,7,7,7,7]
2180 ; SSE-NEXT: packuswb %xmm4, %xmm4
2181 ; SSE-NEXT: pand %xmm13, %xmm4
2182 ; SSE-NEXT: pandn %xmm3, %xmm13
2183 ; SSE-NEXT: por %xmm13, %xmm4
2184 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2185 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2186 ; SSE-NEXT: pandn %xmm6, %xmm0
2187 ; SSE-NEXT: por %xmm0, %xmm15
2188 ; SSE-NEXT: movdqa %xmm15, %xmm0
2189 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
2190 ; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3],xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7]
2191 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535]
2192 ; SSE-NEXT: pand %xmm3, %xmm15
2193 ; SSE-NEXT: pandn %xmm0, %xmm3
2194 ; SSE-NEXT: por %xmm15, %xmm3
2195 ; SSE-NEXT: movdqa %xmm3, %xmm11
2196 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
2197 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[2,1,2,3,4,5,6,7]
2198 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2199 ; SSE-NEXT: packuswb %xmm0, %xmm0
2200 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0]
2201 ; SSE-NEXT: movdqa %xmm2, %xmm3
2202 ; SSE-NEXT: pandn %xmm0, %xmm3
2203 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,2,1,0,4,5,6,7]
2204 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
2205 ; SSE-NEXT: packuswb %xmm0, %xmm0
2206 ; SSE-NEXT: pand %xmm2, %xmm0
2207 ; SSE-NEXT: por %xmm0, %xmm3
2208 ; SSE-NEXT: movdqa %xmm5, %xmm15
2209 ; SSE-NEXT: pandn %xmm3, %xmm15
2210 ; SSE-NEXT: pand %xmm5, %xmm4
2211 ; SSE-NEXT: por %xmm4, %xmm15
2212 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535]
2213 ; SSE-NEXT: movdqa %xmm0, %xmm3
2214 ; SSE-NEXT: pandn %xmm7, %xmm3
2215 ; SSE-NEXT: movdqa %xmm9, %xmm4
2216 ; SSE-NEXT: pand %xmm0, %xmm4
2217 ; SSE-NEXT: por %xmm3, %xmm4
2218 ; SSE-NEXT: movdqa %xmm4, %xmm3
2219 ; SSE-NEXT: pxor %xmm0, %xmm0
2220 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
2221 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
2222 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
2223 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
2224 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
2225 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
2226 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2227 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2228 ; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload
2229 ; SSE-NEXT: pandn %xmm2, %xmm3
2230 ; SSE-NEXT: por %xmm3, %xmm10
2231 ; SSE-NEXT: packuswb %xmm2, %xmm10
2232 ; SSE-NEXT: packuswb %xmm4, %xmm4
2233 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,3,3]
2234 ; SSE-NEXT: movss {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3]
2235 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2236 ; SSE-NEXT: movdqa %xmm7, %xmm3
2237 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,65535,0,65535,65535]
2238 ; SSE-NEXT: pand %xmm4, %xmm3
2239 ; SSE-NEXT: pandn %xmm6, %xmm4
2240 ; SSE-NEXT: movdqa %xmm6, %xmm11
2241 ; SSE-NEXT: por %xmm3, %xmm4
2242 ; SSE-NEXT: movdqa %xmm4, %xmm3
2243 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
2244 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
2245 ; SSE-NEXT: pxor %xmm10, %xmm10
2246 ; SSE-NEXT: pand %xmm1, %xmm4
2247 ; SSE-NEXT: pandn %xmm3, %xmm1
2248 ; SSE-NEXT: por %xmm4, %xmm1
2249 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
2250 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
2251 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
2252 ; SSE-NEXT: packuswb %xmm1, %xmm1
2253 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0]
2254 ; SSE-NEXT: pand %xmm0, %xmm1
2255 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,2,1]
2256 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
2257 ; SSE-NEXT: packuswb %xmm3, %xmm3
2258 ; SSE-NEXT: pandn %xmm3, %xmm0
2259 ; SSE-NEXT: por %xmm1, %xmm0
2260 ; SSE-NEXT: movdqa %xmm5, %xmm1
2261 ; SSE-NEXT: pandn %xmm0, %xmm1
2262 ; SSE-NEXT: andps %xmm5, %xmm2
2263 ; SSE-NEXT: por %xmm2, %xmm1
2264 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2265 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,65535,0,65535]
2266 ; SSE-NEXT: pand %xmm13, %xmm2
2267 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2268 ; SSE-NEXT: movdqa %xmm2, %xmm3
2269 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
2270 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
2271 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,65535,0]
2272 ; SSE-NEXT: pand %xmm4, %xmm2
2273 ; SSE-NEXT: pandn %xmm3, %xmm4
2274 ; SSE-NEXT: por %xmm2, %xmm4
2275 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3]
2276 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,5,4,7,6]
2277 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2278 ; SSE-NEXT: packuswb %xmm3, %xmm6
2279 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,2,3]
2280 ; SSE-NEXT: pand %xmm13, %xmm9
2281 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2282 ; SSE-NEXT: pandn %xmm3, %xmm13
2283 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
2284 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
2285 ; SSE-NEXT: movdqa %xmm2, %xmm3
2286 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
2287 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
2288 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
2289 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
2290 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
2291 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
2292 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
2293 ; SSE-NEXT: packuswb %xmm2, %xmm2
2294 ; SSE-NEXT: movss {{.*#+}} xmm6 = xmm2[0],xmm6[1,2,3]
2295 ; SSE-NEXT: movdqa %xmm7, %xmm2
2296 ; SSE-NEXT: movdqa %xmm7, %xmm0
2297 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535]
2298 ; SSE-NEXT: pand %xmm3, %xmm2
2299 ; SSE-NEXT: pandn %xmm11, %xmm3
2300 ; SSE-NEXT: por %xmm2, %xmm3
2301 ; SSE-NEXT: movdqa %xmm3, %xmm2
2302 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
2303 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
2304 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,0,65535,65535,65535]
2305 ; SSE-NEXT: pand %xmm4, %xmm3
2306 ; SSE-NEXT: pandn %xmm2, %xmm4
2307 ; SSE-NEXT: por %xmm3, %xmm4
2308 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
2309 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,1,0,3]
2310 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6]
2311 ; SSE-NEXT: packuswb %xmm2, %xmm2
2312 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
2313 ; SSE-NEXT: movdqa %xmm3, %xmm7
2314 ; SSE-NEXT: pandn %xmm2, %xmm7
2315 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,1,0,3]
2316 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7]
2317 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
2318 ; SSE-NEXT: packuswb %xmm2, %xmm2
2319 ; SSE-NEXT: pand %xmm3, %xmm2
2320 ; SSE-NEXT: por %xmm2, %xmm7
2321 ; SSE-NEXT: movdqa %xmm5, %xmm2
2322 ; SSE-NEXT: pandn %xmm7, %xmm2
2323 ; SSE-NEXT: andps %xmm5, %xmm6
2324 ; SSE-NEXT: por %xmm6, %xmm2
2325 ; SSE-NEXT: movdqa %xmm13, %xmm7
2326 ; SSE-NEXT: por %xmm9, %xmm7
2327 ; SSE-NEXT: movdqa %xmm7, %xmm4
2328 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
2329 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7]
2330 ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15]
2331 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,1,2,1]
2332 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
2333 ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
2334 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2335 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
2336 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
2337 ; SSE-NEXT: pandn %xmm4, %xmm9
2338 ; SSE-NEXT: movdqa %xmm4, %xmm7
2339 ; SSE-NEXT: por %xmm9, %xmm14
2340 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,2,1,3]
2341 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
2342 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0]
2343 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5]
2344 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,3,3,3]
2345 ; SSE-NEXT: packuswb %xmm7, %xmm4
2346 ; SSE-NEXT: packuswb %xmm6, %xmm6
2347 ; SSE-NEXT: movss {{.*#+}} xmm4 = xmm6[0],xmm4[1,2,3]
2348 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
2349 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
2350 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,3,2,3]
2351 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,2,2,3]
2352 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
2353 ; SSE-NEXT: movdqa %xmm7, %xmm6
2354 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
2355 ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15]
2356 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,0,65535,65535]
2357 ; SSE-NEXT: pand %xmm9, %xmm7
2358 ; SSE-NEXT: pandn %xmm6, %xmm9
2359 ; SSE-NEXT: por %xmm7, %xmm9
2360 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[2,1,1,1]
2361 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
2362 ; SSE-NEXT: packuswb %xmm6, %xmm6
2363 ; SSE-NEXT: pand %xmm3, %xmm6
2364 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[0,1,0,3]
2365 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
2366 ; SSE-NEXT: packuswb %xmm7, %xmm7
2367 ; SSE-NEXT: pandn %xmm7, %xmm3
2368 ; SSE-NEXT: por %xmm3, %xmm6
2369 ; SSE-NEXT: andps %xmm5, %xmm4
2370 ; SSE-NEXT: pandn %xmm6, %xmm5
2371 ; SSE-NEXT: por %xmm4, %xmm5
2372 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2373 ; SSE-NEXT: movaps %xmm3, (%rsi)
2374 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2375 ; SSE-NEXT: movaps %xmm0, (%rdx)
2376 ; SSE-NEXT: movdqa %xmm8, (%rcx)
2377 ; SSE-NEXT: movdqa %xmm15, (%r8)
2378 ; SSE-NEXT: movdqa %xmm1, (%r9)
2379 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
2380 ; SSE-NEXT: movdqa %xmm2, (%rax)
2381 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
2382 ; SSE-NEXT: movdqa %xmm5, (%rax)
2383 ; SSE-NEXT: addq $168, %rsp
2384 ; SSE-NEXT: retq
2386 ; AVX-LABEL: load_i8_stride7_vf16:
2387 ; AVX: # %bb.0:
2388 ; AVX-NEXT: vmovdqa (%rdi), %xmm2
2389 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm7
2390 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm3
2391 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm4
2392 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u]
2393 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u]
2394 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
2395 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,3,10],zero,zero,zero,xmm3[u,u,u,u,u,u]
2396 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
2397 ; AVX-NEXT: vpor %xmm1, %xmm5, %xmm1
2398 ; AVX-NEXT: vmovq {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
2399 ; AVX-NEXT: vpblendvb %xmm5, %xmm0, %xmm1, %xmm0
2400 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2401 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm1
2402 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm5
2403 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm6
2404 ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u]
2405 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[1,8,15],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u]
2406 ; AVX-NEXT: vpor %xmm8, %xmm9, %xmm8
2407 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u]
2408 ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[u,u,u,u,u,u,u]
2409 ; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9
2410 ; AVX-NEXT: vmovq {{.*#+}} xmm11 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
2411 ; AVX-NEXT: vpblendvb %xmm11, %xmm8, %xmm9, %xmm8
2412 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u]
2413 ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u]
2414 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
2415 ; AVX-NEXT: vpxor %xmm12, %xmm12, %xmm12
2416 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm12[7]
2417 ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[3,10]
2418 ; AVX-NEXT: vpor %xmm10, %xmm9, %xmm10
2419 ; AVX-NEXT: vpmovsxwq {{.*#+}} xmm9 = [18446744073709551615,255]
2420 ; AVX-NEXT: vpblendvb %xmm9, %xmm8, %xmm10, %xmm0
2421 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2422 ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u]
2423 ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
2424 ; AVX-NEXT: vpor %xmm10, %xmm13, %xmm10
2425 ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10,u,u,u,u,u,u,u]
2426 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[u,u,u,u,u,u,u]
2427 ; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13
2428 ; AVX-NEXT: vpblendvb %xmm11, %xmm10, %xmm13, %xmm10
2429 ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u]
2430 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15]
2431 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
2432 ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm12[7]
2433 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[4,11]
2434 ; AVX-NEXT: vpor %xmm14, %xmm13, %xmm13
2435 ; AVX-NEXT: vpblendvb %xmm9, %xmm10, %xmm13, %xmm10
2436 ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u]
2437 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm7[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
2438 ; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13
2439 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11,u,u,u,u,u,u,u]
2440 ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[u,u,u,u,u,u,u]
2441 ; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14
2442 ; AVX-NEXT: vpblendvb %xmm11, %xmm13, %xmm14, %xmm11
2443 ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm5[u,u]
2444 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14,u,u]
2445 ; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13
2446 ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm12[7]
2447 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[5,12]
2448 ; AVX-NEXT: vpor %xmm14, %xmm13, %xmm13
2449 ; AVX-NEXT: vpblendvb %xmm9, %xmm11, %xmm13, %xmm11
2450 ; AVX-NEXT: vmovd {{.*#+}} xmm13 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
2451 ; AVX-NEXT: vpshufb %xmm13, %xmm2, %xmm14
2452 ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2453 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
2454 ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u]
2455 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u]
2456 ; AVX-NEXT: vpor %xmm0, %xmm15, %xmm0
2457 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3,4,5,6,7]
2458 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm5[u,u]
2459 ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15,u,u]
2460 ; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14
2461 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3,4,5,6],xmm12[7]
2462 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[6,13]
2463 ; AVX-NEXT: vpor %xmm14, %xmm12, %xmm12
2464 ; AVX-NEXT: vpblendvb %xmm9, %xmm0, %xmm12, %xmm12
2465 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2466 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2467 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
2468 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u],zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u]
2469 ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u]
2470 ; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14
2471 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3,4,5,6,7]
2472 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[2,9,u,u,u]
2473 ; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm5[u,u,u]
2474 ; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14
2475 ; AVX-NEXT: vmovddup {{.*#+}} xmm15 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128]
2476 ; AVX-NEXT: # xmm15 = mem[0,0]
2477 ; AVX-NEXT: vpshufb %xmm15, %xmm14, %xmm14
2478 ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[0,7,14]
2479 ; AVX-NEXT: vpor %xmm8, %xmm14, %xmm8
2480 ; AVX-NEXT: vpblendvb %xmm9, %xmm0, %xmm8, %xmm0
2481 ; AVX-NEXT: vpshufb %xmm13, %xmm7, %xmm7
2482 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
2483 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
2484 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
2485 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u]
2486 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
2487 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3,4,5,6,7]
2488 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[3,10,u,u,u]
2489 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm5[u,u,u]
2490 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
2491 ; AVX-NEXT: vpshufb %xmm15, %xmm3, %xmm3
2492 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[1,8,15]
2493 ; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3
2494 ; AVX-NEXT: vpblendvb %xmm9, %xmm2, %xmm3, %xmm2
2495 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,4,11,u,u,u,u,u,u,u,u,u,u]
2496 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
2497 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2498 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9]
2499 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,0,1,2,3,8,15]
2500 ; AVX-NEXT: vpblendw $31, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
2501 ; AVX-NEXT: # xmm1 = mem[0,1,2,3,4],xmm1[5,6,7]
2502 ; AVX-NEXT: vmovdqa %xmm1, (%rsi)
2503 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2504 ; AVX-NEXT: vmovaps %xmm1, (%rdx)
2505 ; AVX-NEXT: vmovdqa %xmm10, (%rcx)
2506 ; AVX-NEXT: vmovdqa %xmm11, (%r8)
2507 ; AVX-NEXT: vmovdqa %xmm12, (%r9)
2508 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
2509 ; AVX-NEXT: vmovdqa %xmm0, (%rax)
2510 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
2511 ; AVX-NEXT: vmovdqa %xmm2, (%rax)
2512 ; AVX-NEXT: retq
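; The AVX2 variants below pull the first 64 bytes in as two ymm registers, merge them
; with vpblendvb using word masks built by vpmovsxbw, then extract the high xmm half and
; stitch each stream together with vpshufb/vpor; the remaining bytes at 64(%rdi),
; 80(%rdi) and 96(%rdi) are handled as plain xmm loads.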
2514 ; AVX2-LABEL: load_i8_stride7_vf16:
2515 ; AVX2: # %bb.0:
2516 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
2517 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
2518 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
2519 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
2520 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
2521 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
2522 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2523 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u]
2524 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u]
2525 ; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm3
2526 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm9
2527 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm10
2528 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0],xmm10[1],xmm9[2],xmm10[3]
2529 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9]
2530 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2
2531 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero
2532 ; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4
2533 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7]
2534 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
2535 ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4
2536 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
2537 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u]
2538 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u]
2539 ; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm6
2540 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm4
2541 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm5
2542 ; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3,4],xmm4[5],xmm5[6,7]
2543 ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10]
2544 ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero
2545 ; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm8
2546 ; AVX2-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255]
2547 ; AVX2-NEXT: vpblendvb %xmm7, %xmm6, %xmm8, %xmm6
2548 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
2549 ; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm8
2550 ; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u]
2551 ; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm8
2552 ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u]
2553 ; AVX2-NEXT: vpor %xmm11, %xmm8, %xmm8
2554 ; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
2555 ; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11]
2556 ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero
2557 ; AVX2-NEXT: vpor %xmm12, %xmm11, %xmm11
2558 ; AVX2-NEXT: vpblendvb %xmm7, %xmm8, %xmm11, %xmm8
2559 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
2560 ; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm11
2561 ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u,u,u]
2562 ; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm11
2563 ; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u,u,u]
2564 ; AVX2-NEXT: vpor %xmm12, %xmm11, %xmm11
2565 ; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
2566 ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm10[5,12]
2567 ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero
2568 ; AVX2-NEXT: vpor %xmm12, %xmm9, %xmm9
2569 ; AVX2-NEXT: vpblendvb %xmm7, %xmm11, %xmm9, %xmm9
2570 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
2571 ; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11
2572 ; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12
2573 ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u]
2574 ; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u,u,u]
2575 ; AVX2-NEXT: vpor %xmm12, %xmm11, %xmm11
2576 ; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13]
2577 ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero
2578 ; AVX2-NEXT: vpor %xmm12, %xmm10, %xmm10
2579 ; AVX2-NEXT: vpblendvb %xmm7, %xmm11, %xmm10, %xmm10
2580 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
2581 ; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11
2582 ; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12
2583 ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u]
2584 ; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u,u,u,u]
2585 ; AVX2-NEXT: vpor %xmm12, %xmm11, %xmm11
2586 ; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6],xmm4[7]
2587 ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm12[0,7,14]
2588 ; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[2,9],zero,zero,zero
2589 ; AVX2-NEXT: vpor %xmm13, %xmm12, %xmm12
2590 ; AVX2-NEXT: vpblendvb %xmm7, %xmm11, %xmm12, %xmm11
2591 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
2592 ; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0
2593 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u]
2594 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
2595 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u]
2596 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2597 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6],xmm4[7]
2598 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm1[1,8,15]
2599 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[3,10],zero,zero,zero
2600 ; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1
2601 ; AVX2-NEXT: vpblendvb %xmm7, %xmm0, %xmm1, %xmm0
2602 ; AVX2-NEXT: vmovdqa %xmm3, (%rsi)
2603 ; AVX2-NEXT: vmovdqa %xmm6, (%rdx)
2604 ; AVX2-NEXT: vmovdqa %xmm8, (%rcx)
2605 ; AVX2-NEXT: vmovdqa %xmm9, (%r8)
2606 ; AVX2-NEXT: vmovdqa %xmm10, (%r9)
2607 ; AVX2-NEXT: vmovdqa %xmm11, (%r10)
2608 ; AVX2-NEXT: vmovdqa %xmm0, (%rax)
2609 ; AVX2-NEXT: vzeroupper
2610 ; AVX2-NEXT: retq
2612 ; AVX2-FP-LABEL: load_i8_stride7_vf16:
2613 ; AVX2-FP: # %bb.0:
2614 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2615 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
2616 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
2617 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1
2618 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
2619 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
2620 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
2621 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u]
2622 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u]
2623 ; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm3
2624 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm9
2625 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm10
2626 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0],xmm10[1],xmm9[2],xmm10[3]
2627 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9]
2628 ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm2
2629 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero
2630 ; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4
2631 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7]
2632 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
2633 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4
2634 ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5
2635 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u]
2636 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u]
2637 ; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm6
2638 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm4
2639 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm5
2640 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3,4],xmm4[5],xmm5[6,7]
2641 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10]
2642 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero
2643 ; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm8
2644 ; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255]
2645 ; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm6, %xmm8, %xmm6
2646 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
2647 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm8
2648 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u]
2649 ; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm8
2650 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u]
2651 ; AVX2-FP-NEXT: vpor %xmm11, %xmm8, %xmm8
2652 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
2653 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11]
2654 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero
2655 ; AVX2-FP-NEXT: vpor %xmm12, %xmm11, %xmm11
2656 ; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm8, %xmm11, %xmm8
2657 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
2658 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm11
2659 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u,u,u]
2660 ; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm11
2661 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u,u,u]
2662 ; AVX2-FP-NEXT: vpor %xmm12, %xmm11, %xmm11
2663 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
2664 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm10[5,12]
2665 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero
2666 ; AVX2-FP-NEXT: vpor %xmm12, %xmm9, %xmm9
2667 ; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm11, %xmm9, %xmm9
2668 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
2669 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11
2670 ; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm12
2671 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u]
2672 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u,u,u]
2673 ; AVX2-FP-NEXT: vpor %xmm12, %xmm11, %xmm11
2674 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13]
2675 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero
2676 ; AVX2-FP-NEXT: vpor %xmm12, %xmm10, %xmm10
2677 ; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm11, %xmm10, %xmm10
2678 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
2679 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11
2680 ; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm12
2681 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u]
2682 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u,u,u,u]
2683 ; AVX2-FP-NEXT: vpor %xmm12, %xmm11, %xmm11
2684 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6],xmm4[7]
2685 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm12[0,7,14]
2686 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[2,9],zero,zero,zero
2687 ; AVX2-FP-NEXT: vpor %xmm13, %xmm12, %xmm12
2688 ; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm11, %xmm12, %xmm11
2689 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
2690 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0
2691 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u]
2692 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0
2693 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u]
2694 ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0
2695 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6],xmm4[7]
2696 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm1[1,8,15]
2697 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[3,10],zero,zero,zero
2698 ; AVX2-FP-NEXT: vpor %xmm2, %xmm1, %xmm1
2699 ; AVX2-FP-NEXT: vpblendvb %xmm7, %xmm0, %xmm1, %xmm0
2700 ; AVX2-FP-NEXT: vmovdqa %xmm3, (%rsi)
2701 ; AVX2-FP-NEXT: vmovdqa %xmm6, (%rdx)
2702 ; AVX2-FP-NEXT: vmovdqa %xmm8, (%rcx)
2703 ; AVX2-FP-NEXT: vmovdqa %xmm9, (%r8)
2704 ; AVX2-FP-NEXT: vmovdqa %xmm10, (%r9)
2705 ; AVX2-FP-NEXT: vmovdqa %xmm11, (%r10)
2706 ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rax)
2707 ; AVX2-FP-NEXT: vzeroupper
2708 ; AVX2-FP-NEXT: retq
2710 ; AVX2-FCP-LABEL: load_i8_stride7_vf16:
2711 ; AVX2-FCP: # %bb.0:
2712 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2713 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
2714 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
2715 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
2716 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
2717 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
2718 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
2719 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u]
2720 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u]
2721 ; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm3
2722 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm9
2723 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm10
2724 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0],xmm10[1],xmm9[2],xmm10[3]
2725 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9]
2726 ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm2
2727 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero
2728 ; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
2729 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7]
2730 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
2731 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4
2732 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
2733 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u]
2734 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u]
2735 ; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm6
2736 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm4
2737 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
2738 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3,4],xmm4[5],xmm5[6,7]
2739 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10]
2740 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero
2741 ; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm8
2742 ; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255]
2743 ; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm6, %xmm8, %xmm6
2744 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
2745 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm8
2746 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u]
2747 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
2748 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u]
2749 ; AVX2-FCP-NEXT: vpor %xmm11, %xmm8, %xmm8
2750 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
2751 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11]
2752 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero
2753 ; AVX2-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
2754 ; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm8, %xmm11, %xmm8
2755 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
2756 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm11
2757 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u,u,u]
2758 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
2759 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u,u,u]
2760 ; AVX2-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
2761 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
2762 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm10[5,12]
2763 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero
2764 ; AVX2-FCP-NEXT: vpor %xmm12, %xmm9, %xmm9
2765 ; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm11, %xmm9, %xmm9
2766 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
2767 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11
2768 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
2769 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u]
2770 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u,u,u]
2771 ; AVX2-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
2772 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13]
2773 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero
2774 ; AVX2-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10
2775 ; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm11, %xmm10, %xmm10
2776 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
2777 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11
2778 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
2779 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u]
2780 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u,u,u,u]
2781 ; AVX2-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
2782 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6],xmm4[7]
2783 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm12[0,7,14]
2784 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[2,9],zero,zero,zero
2785 ; AVX2-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
2786 ; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm11, %xmm12, %xmm11
2787 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
2788 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0
2789 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u]
2790 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
2791 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u]
2792 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
2793 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm5[1,2,3],xmm4[4],xmm5[5,6],xmm4[7]
2794 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm1[1,8,15]
2795 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[3,10],zero,zero,zero
2796 ; AVX2-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
2797 ; AVX2-FCP-NEXT: vpblendvb %xmm7, %xmm0, %xmm1, %xmm0
2798 ; AVX2-FCP-NEXT: vmovdqa %xmm3, (%rsi)
2799 ; AVX2-FCP-NEXT: vmovdqa %xmm6, (%rdx)
2800 ; AVX2-FCP-NEXT: vmovdqa %xmm8, (%rcx)
2801 ; AVX2-FCP-NEXT: vmovdqa %xmm9, (%r8)
2802 ; AVX2-FCP-NEXT: vmovdqa %xmm10, (%r9)
2803 ; AVX2-FCP-NEXT: vmovdqa %xmm11, (%r10)
2804 ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rax)
2805 ; AVX2-FCP-NEXT: vzeroupper
2806 ; AVX2-FCP-NEXT: retq
2808 ; AVX512-LABEL: load_i8_stride7_vf16:
2809 ; AVX512: # %bb.0:
2810 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
2811 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
2812 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
2813 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm0
2814 ; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm4
2815 ; AVX512-NEXT: vmovdqa (%rdi), %ymm1
2816 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2
2817 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
2818 ; AVX512-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm5
2819 ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
2820 ; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm3
2821 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7]
2822 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm3
2823 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm4
2824 ; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
2825 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
2826 ; AVX512-NEXT: vpshufb %xmm8, %xmm7, %xmm7
2827 ; AVX512-NEXT: vpshufb %xmm8, %xmm5, %xmm5
2828 ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7]
2829 ; AVX512-NEXT: vpor %xmm6, %xmm5, %xmm5
2830 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
2831 ; AVX512-NEXT: vmovdqa %ymm8, %ymm6
2832 ; AVX512-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm6
2833 ; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7
2834 ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u]
2835 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
2836 ; AVX512-NEXT: vpor %xmm7, %xmm6, %xmm9
2837 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7]
2838 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10]
2839 ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
2840 ; AVX512-NEXT: vpor %xmm7, %xmm6, %xmm6
2841 ; AVX512-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255]
2842 ; AVX512-NEXT: vpternlogq $184, %xmm9, %xmm7, %xmm6
2843 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
2844 ; AVX512-NEXT: vmovdqa %ymm9, %ymm10
2845 ; AVX512-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm10
2846 ; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u]
2847 ; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm10
2848 ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u,u,u]
2849 ; AVX512-NEXT: vpor %xmm11, %xmm10, %xmm10
2850 ; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
2851 ; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11]
2852 ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
2853 ; AVX512-NEXT: vpor %xmm12, %xmm11, %xmm11
2854 ; AVX512-NEXT: vpternlogq $184, %xmm10, %xmm7, %xmm11
2855 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
2856 ; AVX512-NEXT: vmovdqa %ymm10, %ymm12
2857 ; AVX512-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm12
2858 ; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u]
2859 ; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm12
2860 ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u]
2861 ; AVX512-NEXT: vpor %xmm13, %xmm12, %xmm12
2862 ; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
2863 ; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm13[5,12]
2864 ; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
2865 ; AVX512-NEXT: vpor %xmm15, %xmm14, %xmm14
2866 ; AVX512-NEXT: vpternlogq $184, %xmm12, %xmm7, %xmm14
2867 ; AVX512-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm8
2868 ; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm12
2869 ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u]
2870 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u]
2871 ; AVX512-NEXT: vpor %xmm12, %xmm8, %xmm8
2872 ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13]
2873 ; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
2874 ; AVX512-NEXT: vpor %xmm13, %xmm12, %xmm12
2875 ; AVX512-NEXT: vpternlogq $184, %xmm8, %xmm7, %xmm12
2876 ; AVX512-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm9
2877 ; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm8
2878 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u]
2879 ; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u]
2880 ; AVX512-NEXT: vpor %xmm8, %xmm9, %xmm8
2881 ; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7]
2882 ; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm9[0,7,14]
2883 ; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
2884 ; AVX512-NEXT: vpor %xmm13, %xmm9, %xmm9
2885 ; AVX512-NEXT: vpternlogq $184, %xmm8, %xmm7, %xmm9
2886 ; AVX512-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm10
2887 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u]
2888 ; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm2
2889 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
2890 ; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm1
2891 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
2892 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15]
2893 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero
2894 ; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
2895 ; AVX512-NEXT: vpternlogq $184, %xmm1, %xmm7, %xmm0
2896 ; AVX512-NEXT: vmovdqa %xmm5, (%rsi)
2897 ; AVX512-NEXT: vmovdqa %xmm6, (%rdx)
2898 ; AVX512-NEXT: vmovdqa %xmm11, (%rcx)
2899 ; AVX512-NEXT: vmovdqa %xmm14, (%r8)
2900 ; AVX512-NEXT: vmovdqa %xmm12, (%r9)
2901 ; AVX512-NEXT: vmovdqa %xmm9, (%r10)
2902 ; AVX512-NEXT: vmovdqa %xmm0, (%rax)
2903 ; AVX512-NEXT: vzeroupper
2904 ; AVX512-NEXT: retq
2906 ; AVX512-FCP-LABEL: load_i8_stride7_vf16:
2907 ; AVX512-FCP: # %bb.0:
2908 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2909 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
2910 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
2911 ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm0
2912 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4
2913 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1
2914 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
2915 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
2916 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm5
2917 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
2918 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3
2919 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7]
2920 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm3
2921 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
2922 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
2923 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
2924 ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7
2925 ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5
2926 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7]
2927 ; AVX512-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
2928 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
2929 ; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm6
2930 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm6
2931 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
2932 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u]
2933 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
2934 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm6, %xmm9
2935 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7]
2936 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10]
2937 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
2938 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
2939 ; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255]
2940 ; AVX512-FCP-NEXT: vpternlogq $184, %xmm9, %xmm7, %xmm6
2941 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
2942 ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm10
2943 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm10
2944 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u]
2945 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10
2946 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u,u,u]
2947 ; AVX512-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10
2948 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
2949 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11]
2950 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
2951 ; AVX512-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
2952 ; AVX512-FCP-NEXT: vpternlogq $184, %xmm10, %xmm7, %xmm11
2953 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
2954 ; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm12
2955 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm12
2956 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u]
2957 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
2958 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u]
2959 ; AVX512-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
2960 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
2961 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm13[5,12]
2962 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
2963 ; AVX512-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
2964 ; AVX512-FCP-NEXT: vpternlogq $184, %xmm12, %xmm7, %xmm14
2965 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm8
2966 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm12
2967 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u]
2968 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u]
2969 ; AVX512-FCP-NEXT: vpor %xmm12, %xmm8, %xmm8
2970 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13]
2971 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
2972 ; AVX512-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
2973 ; AVX512-FCP-NEXT: vpternlogq $184, %xmm8, %xmm7, %xmm12
2974 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm9
2975 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm8
2976 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u]
2977 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u]
2978 ; AVX512-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8
2979 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7]
2980 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm9[0,7,14]
2981 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
2982 ; AVX512-FCP-NEXT: vpor %xmm13, %xmm9, %xmm9
2983 ; AVX512-FCP-NEXT: vpternlogq $184, %xmm8, %xmm7, %xmm9
2984 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm10
2985 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u]
2986 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm2
2987 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
2988 ; AVX512-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1
2989 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
2990 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15]
2991 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero
2992 ; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
2993 ; AVX512-FCP-NEXT: vpternlogq $184, %xmm1, %xmm7, %xmm0
2994 ; AVX512-FCP-NEXT: vmovdqa %xmm5, (%rsi)
2995 ; AVX512-FCP-NEXT: vmovdqa %xmm6, (%rdx)
2996 ; AVX512-FCP-NEXT: vmovdqa %xmm11, (%rcx)
2997 ; AVX512-FCP-NEXT: vmovdqa %xmm14, (%r8)
2998 ; AVX512-FCP-NEXT: vmovdqa %xmm12, (%r9)
2999 ; AVX512-FCP-NEXT: vmovdqa %xmm9, (%r10)
3000 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rax)
3001 ; AVX512-FCP-NEXT: vzeroupper
3002 ; AVX512-FCP-NEXT: retq
3004 ; AVX512DQ-LABEL: load_i8_stride7_vf16:
3005 ; AVX512DQ: # %bb.0:
3006 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
3007 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
3008 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
3009 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm0
3010 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm4
3011 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1
3012 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2
3013 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
3014 ; AVX512DQ-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm5
3015 ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6
3016 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm3
3017 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7]
3018 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm3
3019 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm4
3020 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
3021 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
3022 ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm7, %xmm7
3023 ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm5, %xmm5
3024 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7]
3025 ; AVX512DQ-NEXT: vpor %xmm6, %xmm5, %xmm5
3026 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
3027 ; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm6
3028 ; AVX512DQ-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm6
3029 ; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7
3030 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u]
3031 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
3032 ; AVX512DQ-NEXT: vpor %xmm7, %xmm6, %xmm9
3033 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7]
3034 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10]
3035 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
3036 ; AVX512DQ-NEXT: vpor %xmm7, %xmm6, %xmm6
3037 ; AVX512DQ-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255]
3038 ; AVX512DQ-NEXT: vpternlogq $184, %xmm9, %xmm7, %xmm6
3039 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
3040 ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm10
3041 ; AVX512DQ-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm10
3042 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u]
3043 ; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm10
3044 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u,u,u]
3045 ; AVX512DQ-NEXT: vpor %xmm11, %xmm10, %xmm10
3046 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
3047 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11]
3048 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
3049 ; AVX512DQ-NEXT: vpor %xmm12, %xmm11, %xmm11
3050 ; AVX512DQ-NEXT: vpternlogq $184, %xmm10, %xmm7, %xmm11
3051 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
3052 ; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm12
3053 ; AVX512DQ-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm12
3054 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u]
3055 ; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm12
3056 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u]
3057 ; AVX512DQ-NEXT: vpor %xmm13, %xmm12, %xmm12
3058 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
3059 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm13[5,12]
3060 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
3061 ; AVX512DQ-NEXT: vpor %xmm15, %xmm14, %xmm14
3062 ; AVX512DQ-NEXT: vpternlogq $184, %xmm12, %xmm7, %xmm14
3063 ; AVX512DQ-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm8
3064 ; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm12
3065 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u]
3066 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u]
3067 ; AVX512DQ-NEXT: vpor %xmm12, %xmm8, %xmm8
3068 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13]
3069 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
3070 ; AVX512DQ-NEXT: vpor %xmm13, %xmm12, %xmm12
3071 ; AVX512DQ-NEXT: vpternlogq $184, %xmm8, %xmm7, %xmm12
3072 ; AVX512DQ-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm9
3073 ; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm8
3074 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u]
3075 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u]
3076 ; AVX512DQ-NEXT: vpor %xmm8, %xmm9, %xmm8
3077 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7]
3078 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm9[0,7,14]
3079 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
3080 ; AVX512DQ-NEXT: vpor %xmm13, %xmm9, %xmm9
3081 ; AVX512DQ-NEXT: vpternlogq $184, %xmm8, %xmm7, %xmm9
3082 ; AVX512DQ-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm10
3083 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u]
3084 ; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm2
3085 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
3086 ; AVX512DQ-NEXT: vpor %xmm1, %xmm2, %xmm1
3087 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
3088 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15]
3089 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero
3090 ; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0
3091 ; AVX512DQ-NEXT: vpternlogq $184, %xmm1, %xmm7, %xmm0
3092 ; AVX512DQ-NEXT: vmovdqa %xmm5, (%rsi)
3093 ; AVX512DQ-NEXT: vmovdqa %xmm6, (%rdx)
3094 ; AVX512DQ-NEXT: vmovdqa %xmm11, (%rcx)
3095 ; AVX512DQ-NEXT: vmovdqa %xmm14, (%r8)
3096 ; AVX512DQ-NEXT: vmovdqa %xmm12, (%r9)
3097 ; AVX512DQ-NEXT: vmovdqa %xmm9, (%r10)
3098 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rax)
3099 ; AVX512DQ-NEXT: vzeroupper
3100 ; AVX512DQ-NEXT: retq
3102 ; AVX512DQ-FCP-LABEL: load_i8_stride7_vf16:
3103 ; AVX512DQ-FCP: # %bb.0:
3104 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3105 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
3106 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
3107 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm0
3108 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4
3109 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1
3110 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
3111 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
3112 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm5
3113 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
3114 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3
3115 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7]
3116 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm3
3117 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
3118 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
3119 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
3120 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7
3121 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5
3122 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7]
3123 ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
3124 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
3125 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm6
3126 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm6
3127 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
3128 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u]
3129 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
3130 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm9
3131 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7]
3132 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10]
3133 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
3134 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
3135 ; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255]
3136 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %xmm9, %xmm7, %xmm6
3137 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
3138 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm10
3139 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm10
3140 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u,u,u]
3141 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10
3142 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u,u,u]
3143 ; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10
3144 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
3145 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11]
3146 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
3147 ; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
3148 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %xmm10, %xmm7, %xmm11
3149 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
3150 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm12
3151 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm12
3152 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u]
3153 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
3154 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u]
3155 ; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
3156 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
3157 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm13[5,12]
3158 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
3159 ; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
3160 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %xmm12, %xmm7, %xmm14
3161 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm8
3162 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm12
3163 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u]
3164 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u]
3165 ; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm8, %xmm8
3166 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm13[6,13]
3167 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
3168 ; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
3169 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %xmm8, %xmm7, %xmm12
3170 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm9
3171 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm8
3172 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u]
3173 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u]
3174 ; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8
3175 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7]
3176 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm9[0,7,14]
3177 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
3178 ; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm9, %xmm9
3179 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %xmm8, %xmm7, %xmm9
3180 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm10
3181 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u]
3182 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm2
3183 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
3184 ; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1
3185 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
3186 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15]
3187 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero
3188 ; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
3189 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %xmm1, %xmm7, %xmm0
3190 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%rsi)
3191 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%rdx)
3192 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, (%rcx)
3193 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, (%r8)
3194 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, (%r9)
3195 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, (%r10)
3196 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rax)
3197 ; AVX512DQ-FCP-NEXT: vzeroupper
3198 ; AVX512DQ-FCP-NEXT: retq
3200 ; AVX512BW-LABEL: load_i8_stride7_vf16:
3201 ; AVX512BW: # %bb.0:
3202 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
3203 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
3204 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
3205 ; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm0
3206 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm4
3207 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm2
3208 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
3209 ; AVX512BW-NEXT: movw $-28382, %r11w # imm = 0x9122
3210 ; AVX512BW-NEXT: kmovd %r11d, %k1
3211 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm5 {%k1}
3212 ; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6
3213 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm6, %xmm3
3214 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7]
3215 ; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm3
3216 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm4
3217 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
3218 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
3219 ; AVX512BW-NEXT: vpshufb %xmm8, %xmm7, %xmm7
3220 ; AVX512BW-NEXT: vpshufb %xmm8, %xmm5, %xmm5
3221 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7]
3222 ; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5
3223 ; AVX512BW-NEXT: movw $4644, %di # imm = 0x1224
3224 ; AVX512BW-NEXT: kmovd %edi, %k2
3225 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm6 {%k2}
3226 ; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7
3227 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u]
3228 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
3229 ; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6
3230 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7]
3231 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10]
3232 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
3233 ; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7
3234 ; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00
3235 ; AVX512BW-NEXT: kmovd %edi, %k1
3236 ; AVX512BW-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1}
3237 ; AVX512BW-NEXT: movw $8772, %di # imm = 0x2244
3238 ; AVX512BW-NEXT: kmovd %edi, %k3
3239 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm7 {%k3}
3240 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u]
3241 ; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm7
3242 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u]
3243 ; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7
3244 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
3245 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11]
3246 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
3247 ; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8
3248 ; AVX512BW-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1}
3249 ; AVX512BW-NEXT: movw $9288, %di # imm = 0x2448
3250 ; AVX512BW-NEXT: kmovd %edi, %k4
3251 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm8 {%k4}
3252 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
3253 ; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm8
3254 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u]
3255 ; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8
3256 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
3257 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12]
3258 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
3259 ; AVX512BW-NEXT: vpor %xmm11, %xmm10, %xmm10
3260 ; AVX512BW-NEXT: vmovdqu8 %xmm10, %xmm8 {%k1}
3261 ; AVX512BW-NEXT: vpblendmw %ymm2, %ymm1, %ymm10 {%k2}
3262 ; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm11
3263 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u]
3264 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u]
3265 ; AVX512BW-NEXT: vpor %xmm11, %xmm10, %xmm10
3266 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13]
3267 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
3268 ; AVX512BW-NEXT: vpor %xmm11, %xmm9, %xmm9
3269 ; AVX512BW-NEXT: vmovdqu8 %xmm9, %xmm10 {%k1}
3270 ; AVX512BW-NEXT: vpblendmw %ymm2, %ymm1, %ymm9 {%k3}
3271 ; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm11
3272 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u]
3273 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u]
3274 ; AVX512BW-NEXT: vpor %xmm11, %xmm9, %xmm9
3275 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7]
3276 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14]
3277 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
3278 ; AVX512BW-NEXT: vpor %xmm12, %xmm11, %xmm11
3279 ; AVX512BW-NEXT: vmovdqu8 %xmm11, %xmm9 {%k1}
3280 ; AVX512BW-NEXT: vmovdqu16 %ymm2, %ymm1 {%k4}
3281 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
3282 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
3283 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
3284 ; AVX512BW-NEXT: vpor %xmm2, %xmm1, %xmm1
3285 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
3286 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15]
3287 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero
3288 ; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0
3289 ; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
3290 ; AVX512BW-NEXT: vmovdqa %xmm5, (%rsi)
3291 ; AVX512BW-NEXT: vmovdqa %xmm6, (%rdx)
3292 ; AVX512BW-NEXT: vmovdqa %xmm7, (%rcx)
3293 ; AVX512BW-NEXT: vmovdqa %xmm8, (%r8)
3294 ; AVX512BW-NEXT: vmovdqa %xmm10, (%r9)
3295 ; AVX512BW-NEXT: vmovdqa %xmm9, (%r10)
3296 ; AVX512BW-NEXT: vmovdqa %xmm1, (%rax)
3297 ; AVX512BW-NEXT: vzeroupper
3298 ; AVX512BW-NEXT: retq
3300 ; AVX512BW-FCP-LABEL: load_i8_stride7_vf16:
3301 ; AVX512BW-FCP: # %bb.0:
3302 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3303 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
3304 ; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
3305 ; AVX512BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm0
3306 ; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4
3307 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm2
3308 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
3309 ; AVX512BW-FCP-NEXT: movw $-28382, %r11w # imm = 0x9122
3310 ; AVX512BW-FCP-NEXT: kmovd %r11d, %k1
3311 ; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm5 {%k1}
3312 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
3313 ; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3
3314 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7]
3315 ; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm3
3316 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
3317 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
3318 ; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
3319 ; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7
3320 ; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5
3321 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7]
3322 ; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
3323 ; AVX512BW-FCP-NEXT: movw $4644, %di # imm = 0x1224
3324 ; AVX512BW-FCP-NEXT: kmovd %edi, %k2
3325 ; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm6 {%k2}
3326 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
3327 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u]
3328 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
3329 ; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
3330 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7]
3331 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10]
3332 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
3333 ; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
3334 ; AVX512BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00
3335 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1
3336 ; AVX512BW-FCP-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1}
3337 ; AVX512BW-FCP-NEXT: movw $8772, %di # imm = 0x2244
3338 ; AVX512BW-FCP-NEXT: kmovd %edi, %k3
3339 ; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm7 {%k3}
3340 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u]
3341 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
3342 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u]
3343 ; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
3344 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
3345 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11]
3346 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
3347 ; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
3348 ; AVX512BW-FCP-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1}
3349 ; AVX512BW-FCP-NEXT: movw $9288, %di # imm = 0x2448
3350 ; AVX512BW-FCP-NEXT: kmovd %edi, %k4
3351 ; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm8 {%k4}
3352 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
3353 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
3354 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u]
3355 ; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
3356 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
3357 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12]
3358 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
3359 ; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10
3360 ; AVX512BW-FCP-NEXT: vmovdqu8 %xmm10, %xmm8 {%k1}
3361 ; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm1, %ymm10 {%k2}
3362 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11
3363 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u]
3364 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u]
3365 ; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10
3366 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13]
3367 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
3368 ; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9
3369 ; AVX512BW-FCP-NEXT: vmovdqu8 %xmm9, %xmm10 {%k1}
3370 ; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm1, %ymm9 {%k3}
3371 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11
3372 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u]
3373 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u]
3374 ; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9
3375 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7]
3376 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14]
3377 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
3378 ; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
3379 ; AVX512BW-FCP-NEXT: vmovdqu8 %xmm11, %xmm9 {%k1}
3380 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm1 {%k4}
3381 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
3382 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
3383 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
3384 ; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
3385 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
3386 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15]
3387 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero
3388 ; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
3389 ; AVX512BW-FCP-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
3390 ; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%rsi)
3391 ; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%rdx)
3392 ; AVX512BW-FCP-NEXT: vmovdqa %xmm7, (%rcx)
3393 ; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%r8)
3394 ; AVX512BW-FCP-NEXT: vmovdqa %xmm10, (%r9)
3395 ; AVX512BW-FCP-NEXT: vmovdqa %xmm9, (%r10)
3396 ; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%rax)
3397 ; AVX512BW-FCP-NEXT: vzeroupper
3398 ; AVX512BW-FCP-NEXT: retq
3400 ; AVX512DQ-BW-LABEL: load_i8_stride7_vf16:
3401 ; AVX512DQ-BW: # %bb.0:
3402 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
3403 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
3404 ; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
3405 ; AVX512DQ-BW-NEXT: vmovdqa 80(%rdi), %xmm0
3406 ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm0, %xmm4
3407 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm2
3408 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm1
3409 ; AVX512DQ-BW-NEXT: movw $-28382, %r11w # imm = 0x9122
3410 ; AVX512DQ-BW-NEXT: kmovd %r11d, %k1
3411 ; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm5 {%k1}
3412 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm6
3413 ; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm6, %xmm3
3414 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7]
3415 ; AVX512DQ-BW-NEXT: vmovdqa 96(%rdi), %xmm3
3416 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm4
3417 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
3418 ; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
3419 ; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm7, %xmm7
3420 ; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm5, %xmm5
3421 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7]
3422 ; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm5, %xmm5
3423 ; AVX512DQ-BW-NEXT: movw $4644, %di # imm = 0x1224
3424 ; AVX512DQ-BW-NEXT: kmovd %edi, %k2
3425 ; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm6 {%k2}
3426 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7
3427 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u]
3428 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
3429 ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6
3430 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7]
3431 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10]
3432 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
3433 ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7
3434 ; AVX512DQ-BW-NEXT: movw $-512, %di # imm = 0xFE00
3435 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1
3436 ; AVX512DQ-BW-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1}
3437 ; AVX512DQ-BW-NEXT: movw $8772, %di # imm = 0x2244
3438 ; AVX512DQ-BW-NEXT: kmovd %edi, %k3
3439 ; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm7 {%k3}
3440 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u]
3441 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm7
3442 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u]
3443 ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7
3444 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
3445 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11]
3446 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
3447 ; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm8, %xmm8
3448 ; AVX512DQ-BW-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1}
3449 ; AVX512DQ-BW-NEXT: movw $9288, %di # imm = 0x2448
3450 ; AVX512DQ-BW-NEXT: kmovd %edi, %k4
3451 ; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm8 {%k4}
3452 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
3453 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm8, %xmm8
3454 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u]
3455 ; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm8, %xmm8
3456 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
3457 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12]
3458 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
3459 ; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm10, %xmm10
3460 ; AVX512DQ-BW-NEXT: vmovdqu8 %xmm10, %xmm8 {%k1}
3461 ; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm1, %ymm10 {%k2}
3462 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm10, %xmm11
3463 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u]
3464 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u]
3465 ; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm10, %xmm10
3466 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13]
3467 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
3468 ; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm9, %xmm9
3469 ; AVX512DQ-BW-NEXT: vmovdqu8 %xmm9, %xmm10 {%k1}
3470 ; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm1, %ymm9 {%k3}
3471 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm9, %xmm11
3472 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u]
3473 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u]
3474 ; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm9, %xmm9
3475 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7]
3476 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14]
3477 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
3478 ; AVX512DQ-BW-NEXT: vpor %xmm12, %xmm11, %xmm11
3479 ; AVX512DQ-BW-NEXT: vmovdqu8 %xmm11, %xmm9 {%k1}
3480 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm2, %ymm1 {%k4}
3481 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
3482 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm1
3483 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
3484 ; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm1, %xmm1
3485 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
3486 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15]
3487 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero
3488 ; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm2, %xmm0
3489 ; AVX512DQ-BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
3490 ; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%rsi)
3491 ; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%rdx)
3492 ; AVX512DQ-BW-NEXT: vmovdqa %xmm7, (%rcx)
3493 ; AVX512DQ-BW-NEXT: vmovdqa %xmm8, (%r8)
3494 ; AVX512DQ-BW-NEXT: vmovdqa %xmm10, (%r9)
3495 ; AVX512DQ-BW-NEXT: vmovdqa %xmm9, (%r10)
3496 ; AVX512DQ-BW-NEXT: vmovdqa %xmm1, (%rax)
3497 ; AVX512DQ-BW-NEXT: vzeroupper
3498 ; AVX512DQ-BW-NEXT: retq
3499 ;
3500 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf16:
3501 ; AVX512DQ-BW-FCP: # %bb.0:
3501 ; AVX512DQ-BW-FCP: # %bb.0:
3502 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3503 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
3504 ; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
3505 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm0
3506 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4
3507 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm2
3508 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
3509 ; AVX512DQ-BW-FCP-NEXT: movw $-28382, %r11w # imm = 0x9122
3510 ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1
3511 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm5 {%k1}
3512 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
3513 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3
3514 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7]
3515 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm3
3516 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
3517 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
3518 ; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
3519 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7
3520 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5
3521 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7]
3522 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
3523 ; AVX512DQ-BW-FCP-NEXT: movw $4644, %di # imm = 0x1224
3524 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2
3525 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm6 {%k2}
3526 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
3527 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u]
3528 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
3529 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
3530 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7]
3531 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10]
3532 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
3533 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
3534 ; AVX512DQ-BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00
3535 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
3536 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1}
3537 ; AVX512DQ-BW-FCP-NEXT: movw $8772, %di # imm = 0x2244
3538 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k3
3539 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm7 {%k3}
3540 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u]
3541 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
3542 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u]
3543 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
3544 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
3545 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11]
3546 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
3547 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
3548 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1}
3549 ; AVX512DQ-BW-FCP-NEXT: movw $9288, %di # imm = 0x2448
3550 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k4
3551 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm8 {%k4}
3552 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
3553 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
3554 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u]
3555 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
3556 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
3557 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12]
3558 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
3559 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10
3560 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm10, %xmm8 {%k1}
3561 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm1, %ymm10 {%k2}
3562 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11
3563 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u]
3564 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u]
3565 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10
3566 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13]
3567 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
3568 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9
3569 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm9, %xmm10 {%k1}
3570 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm1, %ymm9 {%k3}
3571 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11
3572 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u]
3573 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u]
3574 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9
3575 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7]
3576 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14]
3577 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
3578 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
3579 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm11, %xmm9 {%k1}
3580 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm1 {%k4}
3581 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
3582 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
3583 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
3584 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
3585 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
3586 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15]
3587 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero
3588 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
3589 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
3590 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%rsi)
3591 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%rdx)
3592 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm7, (%rcx)
3593 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%r8)
3594 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm10, (%r9)
3595 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm9, (%r10)
3596 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%rax)
3597 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
3598 ; AVX512DQ-BW-FCP-NEXT: retq
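; The IR below is the vf16 test body: one <112 x i8> load split into seven
; stride-7 slices, where element j of %strided.vecK takes byte K + 7*j of
; %wide.vec (K = 0..6, j = 0..15), matching the seven output pointers.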
3599 %wide.vec = load <112 x i8>, ptr %in.vec, align 64
3600 %strided.vec0 = shufflevector <112 x i8> %wide.vec, <112 x i8> poison, <16 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105>
3601 %strided.vec1 = shufflevector <112 x i8> %wide.vec, <112 x i8> poison, <16 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106>
3602 %strided.vec2 = shufflevector <112 x i8> %wide.vec, <112 x i8> poison, <16 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107>
3603 %strided.vec3 = shufflevector <112 x i8> %wide.vec, <112 x i8> poison, <16 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108>
3604 %strided.vec4 = shufflevector <112 x i8> %wide.vec, <112 x i8> poison, <16 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109>
3605 %strided.vec5 = shufflevector <112 x i8> %wide.vec, <112 x i8> poison, <16 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110>
3606 %strided.vec6 = shufflevector <112 x i8> %wide.vec, <112 x i8> poison, <16 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111>
3607 store <16 x i8> %strided.vec0, ptr %out.vec0, align 64
3608 store <16 x i8> %strided.vec1, ptr %out.vec1, align 64
3609 store <16 x i8> %strided.vec2, ptr %out.vec2, align 64
3610 store <16 x i8> %strided.vec3, ptr %out.vec3, align 64
3611 store <16 x i8> %strided.vec4, ptr %out.vec4, align 64
3612 store <16 x i8> %strided.vec5, ptr %out.vec5, align 64
3613 store <16 x i8> %strided.vec6, ptr %out.vec6, align 64
3614 ret void
3615 }
3617 define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
3618 ; SSE-LABEL: load_i8_stride7_vf32:
3619 ; SSE: # %bb.0:
3620 ; SSE-NEXT: subq $648, %rsp # imm = 0x288
3621 ; SSE-NEXT: movdqa 208(%rdi), %xmm14
3622 ; SSE-NEXT: movdqa 192(%rdi), %xmm5
3623 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3624 ; SSE-NEXT: movdqa 176(%rdi), %xmm6
3625 ; SSE-NEXT: movdqa 112(%rdi), %xmm4
3626 ; SSE-NEXT: movdqa 128(%rdi), %xmm3
3627 ; SSE-NEXT: movdqa 160(%rdi), %xmm7
3628 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3629 ; SSE-NEXT: movdqa 144(%rdi), %xmm1
3630 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3631 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535]
3632 ; SSE-NEXT: movdqa %xmm2, %xmm0
3633 ; SSE-NEXT: pandn %xmm1, %xmm0
3634 ; SSE-NEXT: movdqa %xmm7, %xmm1
3635 ; SSE-NEXT: pand %xmm2, %xmm1
3636 ; SSE-NEXT: movdqa %xmm2, %xmm9
3637 ; SSE-NEXT: por %xmm0, %xmm1
3638 ; SSE-NEXT: pxor %xmm10, %xmm10
3639 ; SSE-NEXT: movdqa %xmm1, %xmm0
3640 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15]
3641 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
3642 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3643 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
3644 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
3645 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,5,6]
3646 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
3647 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
3648 ; SSE-NEXT: packuswb %xmm0, %xmm2
3649 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255]
3650 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,65535,0,65535]
3651 ; SSE-NEXT: movdqa %xmm7, %xmm1
3652 ; SSE-NEXT: pandn %xmm3, %xmm1
3653 ; SSE-NEXT: movdqa %xmm3, %xmm11
3654 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3655 ; SSE-NEXT: movdqa %xmm4, %xmm3
3656 ; SSE-NEXT: movdqa %xmm4, %xmm12
3657 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3658 ; SSE-NEXT: pand %xmm7, %xmm3
3659 ; SSE-NEXT: movdqa %xmm7, %xmm8
3660 ; SSE-NEXT: por %xmm1, %xmm3
3661 ; SSE-NEXT: movdqa %xmm3, %xmm1
3662 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15]
3663 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,65535]
3664 ; SSE-NEXT: movdqa %xmm7, %xmm4
3665 ; SSE-NEXT: pandn %xmm1, %xmm4
3666 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
3667 ; SSE-NEXT: pand %xmm7, %xmm3
3668 ; SSE-NEXT: movdqa %xmm7, %xmm15
3669 ; SSE-NEXT: por %xmm4, %xmm3
3670 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3]
3671 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
3672 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1]
3673 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
3674 ; SSE-NEXT: packuswb %xmm1, %xmm1
3675 ; SSE-NEXT: pand %xmm0, %xmm1
3676 ; SSE-NEXT: movdqa %xmm0, %xmm3
3677 ; SSE-NEXT: pandn %xmm2, %xmm3
3678 ; SSE-NEXT: por %xmm3, %xmm1
3679 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,0,65535]
3680 ; SSE-NEXT: movdqa %xmm7, %xmm2
3681 ; SSE-NEXT: pandn %xmm6, %xmm2
3682 ; SSE-NEXT: movdqa %xmm6, %xmm13
3683 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3684 ; SSE-NEXT: movdqa %xmm5, %xmm3
3685 ; SSE-NEXT: pand %xmm7, %xmm3
3686 ; SSE-NEXT: por %xmm2, %xmm3
3687 ; SSE-NEXT: movdqa %xmm3, %xmm2
3688 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
3689 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7]
3690 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
3691 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
3692 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
3693 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
3694 ; SSE-NEXT: movdqa %xmm14, %xmm3
3695 ; SSE-NEXT: movdqa %xmm14, %xmm4
3696 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
3697 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3698 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
3699 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3700 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
3701 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
3702 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
3703 ; SSE-NEXT: packuswb %xmm3, %xmm3
3704 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0]
3705 ; SSE-NEXT: movdqa %xmm5, %xmm4
3706 ; SSE-NEXT: pandn %xmm3, %xmm4
3707 ; SSE-NEXT: packuswb %xmm2, %xmm2
3708 ; SSE-NEXT: pand %xmm5, %xmm2
3709 ; SSE-NEXT: movdqa %xmm5, %xmm6
3710 ; SSE-NEXT: por %xmm2, %xmm4
3711 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,0,0]
3712 ; SSE-NEXT: movdqa %xmm5, %xmm2
3713 ; SSE-NEXT: pandn %xmm4, %xmm2
3714 ; SSE-NEXT: pand %xmm5, %xmm1
3715 ; SSE-NEXT: por %xmm1, %xmm2
3716 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3717 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
3718 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3719 ; SSE-NEXT: movdqa %xmm9, %xmm1
3720 ; SSE-NEXT: pandn %xmm2, %xmm1
3721 ; SSE-NEXT: movdqa 48(%rdi), %xmm2
3722 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3723 ; SSE-NEXT: pand %xmm9, %xmm2
3724 ; SSE-NEXT: por %xmm1, %xmm2
3725 ; SSE-NEXT: movdqa %xmm2, %xmm1
3726 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15]
3727 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
3728 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3729 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
3730 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
3731 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6]
3732 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
3733 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
3734 ; SSE-NEXT: packuswb %xmm1, %xmm2
3735 ; SSE-NEXT: movdqa 16(%rdi), %xmm3
3736 ; SSE-NEXT: movdqa %xmm8, %xmm1
3737 ; SSE-NEXT: pandn %xmm3, %xmm1
3738 ; SSE-NEXT: movdqa %xmm3, %xmm14
3739 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3740 ; SSE-NEXT: movdqa (%rdi), %xmm4
3741 ; SSE-NEXT: movdqa %xmm4, %xmm3
3742 ; SSE-NEXT: movdqa %xmm4, %xmm9
3743 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3744 ; SSE-NEXT: pand %xmm8, %xmm3
3745 ; SSE-NEXT: por %xmm1, %xmm3
3746 ; SSE-NEXT: movdqa %xmm3, %xmm1
3747 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15]
3748 ; SSE-NEXT: movdqa %xmm15, %xmm4
3749 ; SSE-NEXT: pandn %xmm1, %xmm4
3750 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
3751 ; SSE-NEXT: pand %xmm15, %xmm3
3752 ; SSE-NEXT: por %xmm4, %xmm3
3753 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3]
3754 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
3755 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1]
3756 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
3757 ; SSE-NEXT: packuswb %xmm1, %xmm1
3758 ; SSE-NEXT: pand %xmm0, %xmm1
3759 ; SSE-NEXT: pandn %xmm2, %xmm0
3760 ; SSE-NEXT: por %xmm0, %xmm1
3761 ; SSE-NEXT: movdqa 64(%rdi), %xmm2
3762 ; SSE-NEXT: movdqa %xmm7, %xmm0
3763 ; SSE-NEXT: pandn %xmm2, %xmm0
3764 ; SSE-NEXT: movdqa %xmm2, %xmm15
3765 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3766 ; SSE-NEXT: movdqa 80(%rdi), %xmm8
3767 ; SSE-NEXT: movdqa %xmm8, %xmm2
3768 ; SSE-NEXT: pand %xmm7, %xmm2
3769 ; SSE-NEXT: por %xmm0, %xmm2
3770 ; SSE-NEXT: movdqa %xmm2, %xmm0
3771 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
3772 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
3773 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15]
3774 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
3775 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
3776 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
3777 ; SSE-NEXT: movdqa 96(%rdi), %xmm2
3778 ; SSE-NEXT: movdqa %xmm2, %xmm3
3779 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
3780 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3781 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
3782 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3783 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
3784 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
3785 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
3786 ; SSE-NEXT: packuswb %xmm2, %xmm2
3787 ; SSE-NEXT: movdqa %xmm6, %xmm3
3788 ; SSE-NEXT: pandn %xmm2, %xmm3
3789 ; SSE-NEXT: packuswb %xmm0, %xmm0
3790 ; SSE-NEXT: pand %xmm6, %xmm0
3791 ; SSE-NEXT: por %xmm0, %xmm3
3792 ; SSE-NEXT: pand %xmm5, %xmm1
3793 ; SSE-NEXT: pandn %xmm3, %xmm5
3794 ; SSE-NEXT: por %xmm1, %xmm5
3795 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3796 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535]
3797 ; SSE-NEXT: movdqa %xmm2, %xmm0
3798 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3799 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3800 ; SSE-NEXT: pand %xmm2, %xmm1
3801 ; SSE-NEXT: por %xmm0, %xmm1
3802 ; SSE-NEXT: movdqa %xmm1, %xmm2
3803 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15]
3804 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535]
3805 ; SSE-NEXT: movdqa %xmm0, %xmm3
3806 ; SSE-NEXT: pandn %xmm2, %xmm3
3807 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
3808 ; SSE-NEXT: pand %xmm0, %xmm1
3809 ; SSE-NEXT: por %xmm3, %xmm1
3810 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
3811 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
3812 ; SSE-NEXT: psrld $16, %xmm2
3813 ; SSE-NEXT: packuswb %xmm2, %xmm1
3814 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255]
3815 ; SSE-NEXT: movdqa %xmm5, %xmm2
3816 ; SSE-NEXT: pandn %xmm1, %xmm2
3817 ; SSE-NEXT: movdqa %xmm7, %xmm1
3818 ; SSE-NEXT: pandn %xmm11, %xmm1
3819 ; SSE-NEXT: movdqa %xmm12, %xmm3
3820 ; SSE-NEXT: pand %xmm7, %xmm3
3821 ; SSE-NEXT: movdqa %xmm7, %xmm12
3822 ; SSE-NEXT: por %xmm1, %xmm3
3823 ; SSE-NEXT: movdqa %xmm3, %xmm1
3824 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3],xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
3825 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,0,65535]
3826 ; SSE-NEXT: movdqa %xmm7, %xmm4
3827 ; SSE-NEXT: pandn %xmm1, %xmm4
3828 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
3829 ; SSE-NEXT: pand %xmm7, %xmm3
3830 ; SSE-NEXT: por %xmm4, %xmm3
3831 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3]
3832 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
3833 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
3834 ; SSE-NEXT: packuswb %xmm1, %xmm1
3835 ; SSE-NEXT: pand %xmm5, %xmm1
3836 ; SSE-NEXT: movdqa %xmm5, %xmm7
3837 ; SSE-NEXT: por %xmm2, %xmm1
3838 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,65535,0,65535]
3839 ; SSE-NEXT: movdqa %xmm4, %xmm2
3840 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3841 ; SSE-NEXT: movdqa %xmm13, %xmm3
3842 ; SSE-NEXT: pand %xmm4, %xmm3
3843 ; SSE-NEXT: movdqa %xmm4, %xmm13
3844 ; SSE-NEXT: por %xmm2, %xmm3
3845 ; SSE-NEXT: movdqa %xmm3, %xmm2
3846 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm10[8],xmm2[9],xmm10[9],xmm2[10],xmm10[10],xmm2[11],xmm10[11],xmm2[12],xmm10[12],xmm2[13],xmm10[13],xmm2[14],xmm10[14],xmm2[15],xmm10[15]
3847 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,0,65535,0,65535]
3848 ; SSE-NEXT: movdqa %xmm11, %xmm4
3849 ; SSE-NEXT: pandn %xmm2, %xmm4
3850 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3],xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7]
3851 ; SSE-NEXT: pand %xmm11, %xmm3
3852 ; SSE-NEXT: por %xmm4, %xmm3
3853 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3854 ; SSE-NEXT: pslld $16, %xmm2
3855 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3856 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
3857 ; SSE-NEXT: packuswb %xmm4, %xmm2
3858 ; SSE-NEXT: movdqa %xmm6, %xmm4
3859 ; SSE-NEXT: pandn %xmm2, %xmm4
3860 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,3,2,3]
3861 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7]
3862 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7]
3863 ; SSE-NEXT: packuswb %xmm2, %xmm2
3864 ; SSE-NEXT: pand %xmm6, %xmm2
3865 ; SSE-NEXT: por %xmm2, %xmm4
3866 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
3867 ; SSE-NEXT: movdqa %xmm3, %xmm2
3868 ; SSE-NEXT: pandn %xmm4, %xmm2
3869 ; SSE-NEXT: pand %xmm3, %xmm1
3870 ; SSE-NEXT: por %xmm1, %xmm2
3871 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3872 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535]
3873 ; SSE-NEXT: movdqa %xmm5, %xmm1
3874 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3875 ; SSE-NEXT: pandn %xmm6, %xmm1
3876 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3877 ; SSE-NEXT: movdqa %xmm10, %xmm2
3878 ; SSE-NEXT: pand %xmm5, %xmm2
3879 ; SSE-NEXT: por %xmm1, %xmm2
3880 ; SSE-NEXT: movdqa %xmm2, %xmm1
3881 ; SSE-NEXT: pxor %xmm3, %xmm3
3882 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
3883 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
3884 ; SSE-NEXT: pxor %xmm5, %xmm5
3885 ; SSE-NEXT: pand %xmm0, %xmm2
3886 ; SSE-NEXT: pandn %xmm1, %xmm0
3887 ; SSE-NEXT: por %xmm2, %xmm0
3888 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
3889 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,6]
3890 ; SSE-NEXT: psrld $16, %xmm1
3891 ; SSE-NEXT: packuswb %xmm1, %xmm0
3892 ; SSE-NEXT: movdqa %xmm7, %xmm4
3893 ; SSE-NEXT: movdqa %xmm7, %xmm1
3894 ; SSE-NEXT: pandn %xmm0, %xmm1
3895 ; SSE-NEXT: movdqa %xmm12, %xmm0
3896 ; SSE-NEXT: pandn %xmm14, %xmm0
3897 ; SSE-NEXT: movdqa %xmm9, %xmm2
3898 ; SSE-NEXT: pand %xmm12, %xmm2
3899 ; SSE-NEXT: por %xmm0, %xmm2
3900 ; SSE-NEXT: movdqa %xmm2, %xmm0
3901 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
3902 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,0,65535]
3903 ; SSE-NEXT: movdqa %xmm7, %xmm3
3904 ; SSE-NEXT: pandn %xmm0, %xmm3
3905 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
3906 ; SSE-NEXT: pand %xmm7, %xmm2
3907 ; SSE-NEXT: por %xmm3, %xmm2
3908 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3]
3909 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
3910 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
3911 ; SSE-NEXT: packuswb %xmm0, %xmm0
3912 ; SSE-NEXT: pand %xmm4, %xmm0
3913 ; SSE-NEXT: por %xmm1, %xmm0
3914 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3915 ; SSE-NEXT: movdqa %xmm13, %xmm0
3916 ; SSE-NEXT: pandn %xmm8, %xmm0
3917 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3918 ; SSE-NEXT: movdqa %xmm15, %xmm1
3919 ; SSE-NEXT: pand %xmm13, %xmm1
3920 ; SSE-NEXT: por %xmm0, %xmm1
3921 ; SSE-NEXT: movdqa %xmm1, %xmm0
3922 ; SSE-NEXT: pxor %xmm2, %xmm2
3923 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
3924 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
3925 ; SSE-NEXT: movdqa %xmm11, %xmm2
3926 ; SSE-NEXT: pand %xmm11, %xmm1
3927 ; SSE-NEXT: pandn %xmm0, %xmm2
3928 ; SSE-NEXT: por %xmm1, %xmm2
3929 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3930 ; SSE-NEXT: movdqa %xmm13, %xmm0
3931 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3932 ; SSE-NEXT: pandn %xmm1, %xmm0
3933 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3934 ; SSE-NEXT: movdqa %xmm5, %xmm9
3935 ; SSE-NEXT: pand %xmm13, %xmm9
3936 ; SSE-NEXT: por %xmm0, %xmm9
3937 ; SSE-NEXT: movdqa %xmm6, %xmm0
3938 ; SSE-NEXT: pand %xmm13, %xmm0
3939 ; SSE-NEXT: pandn %xmm10, %xmm13
3940 ; SSE-NEXT: por %xmm0, %xmm13
3941 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3942 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,65535]
3943 ; SSE-NEXT: movdqa %xmm2, %xmm0
3944 ; SSE-NEXT: pandn %xmm5, %xmm0
3945 ; SSE-NEXT: movdqa %xmm12, %xmm7
3946 ; SSE-NEXT: movdqa %xmm12, %xmm5
3947 ; SSE-NEXT: pandn %xmm1, %xmm5
3948 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3949 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,2,2,3]
3950 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3951 ; SSE-NEXT: pand %xmm2, %xmm1
3952 ; SSE-NEXT: por %xmm0, %xmm1
3953 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3954 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3955 ; SSE-NEXT: pand %xmm2, %xmm13
3956 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3957 ; SSE-NEXT: pand %xmm2, %xmm12
3958 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3959 ; SSE-NEXT: pand %xmm2, %xmm14
3960 ; SSE-NEXT: pand %xmm2, %xmm8
3961 ; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill
3962 ; SSE-NEXT: movdqa %xmm7, %xmm1
3963 ; SSE-NEXT: pandn %xmm10, %xmm1
3964 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3965 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,2,2,3]
3966 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3967 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3968 ; SSE-NEXT: pand %xmm2, %xmm10
3969 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3970 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3971 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3972 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3973 ; SSE-NEXT: pandn %xmm6, %xmm2
3974 ; SSE-NEXT: por %xmm10, %xmm2
3975 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3976 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535]
3977 ; SSE-NEXT: movdqa %xmm7, %xmm1
3978 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3979 ; SSE-NEXT: pandn %xmm2, %xmm1
3980 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3981 ; SSE-NEXT: movdqa %xmm2, %xmm5
3982 ; SSE-NEXT: movdqa %xmm2, %xmm3
3983 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3984 ; SSE-NEXT: movdqa %xmm6, %xmm8
3985 ; SSE-NEXT: pslld $16, %xmm8
3986 ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3987 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3988 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
3989 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3990 ; SSE-NEXT: movdqa %xmm1, %xmm15
3991 ; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
3992 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3]
3993 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
3994 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3995 ; SSE-NEXT: movdqa %xmm0, %xmm3
3996 ; SSE-NEXT: movdqa %xmm1, %xmm0
3997 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
3998 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3999 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4000 ; SSE-NEXT: pxor %xmm10, %xmm10
4001 ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
4002 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,7]
4003 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,0]
4004 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,6,4,6,5]
4005 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,0,65535]
4006 ; SSE-NEXT: pand %xmm4, %xmm0
4007 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4008 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4009 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4010 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
4011 ; SSE-NEXT: pxor %xmm9, %xmm9
4012 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,7,5,6,7]
4013 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0]
4014 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,4,6,5]
4015 ; SSE-NEXT: movdqa %xmm4, %xmm0
4016 ; SSE-NEXT: pand %xmm4, %xmm10
4017 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4018 ; SSE-NEXT: pandn %xmm3, %xmm4
4019 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4020 ; SSE-NEXT: pand %xmm0, %xmm2
4021 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4022 ; SSE-NEXT: movdqa %xmm1, %xmm4
4023 ; SSE-NEXT: pand %xmm0, %xmm4
4024 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4025 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4026 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4027 ; SSE-NEXT: movdqa %xmm6, %xmm4
4028 ; SSE-NEXT: pandn %xmm6, %xmm0
4029 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4030 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4031 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4032 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
4033 ; SSE-NEXT: pand %xmm7, %xmm0
4034 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4035 ; SSE-NEXT: pand %xmm7, %xmm3
4036 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4037 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4038 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4039 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
4040 ; SSE-NEXT: pand %xmm7, %xmm6
4041 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4042 ; SSE-NEXT: pand %xmm7, %xmm4
4043 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4044 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4045 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4046 ; SSE-NEXT: pandn %xmm1, %xmm7
4047 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4048 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
4049 ; SSE-NEXT: packuswb %xmm1, %xmm1
4050 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0]
4051 ; SSE-NEXT: movdqa %xmm0, %xmm10
4052 ; SSE-NEXT: pandn %xmm1, %xmm10
4053 ; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4054 ; SSE-NEXT: # xmm1 = mem[0,3,2,3]
4055 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7]
4056 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,6,7]
4057 ; SSE-NEXT: packuswb %xmm1, %xmm1
4058 ; SSE-NEXT: pand %xmm0, %xmm1
4059 ; SSE-NEXT: movdqa %xmm0, %xmm2
4060 ; SSE-NEXT: por %xmm1, %xmm10
4061 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
4062 ; SSE-NEXT: movdqa %xmm0, %xmm3
4063 ; SSE-NEXT: pandn %xmm10, %xmm3
4064 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4065 ; SSE-NEXT: pand %xmm0, %xmm1
4066 ; SSE-NEXT: movdqa %xmm0, %xmm8
4067 ; SSE-NEXT: por %xmm1, %xmm3
4068 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4069 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,0,65535]
4070 ; SSE-NEXT: movdqa %xmm0, %xmm1
4071 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4072 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4073 ; SSE-NEXT: pand %xmm0, %xmm10
4074 ; SSE-NEXT: por %xmm1, %xmm10
4075 ; SSE-NEXT: movdqa %xmm10, %xmm1
4076 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
4077 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,0,65535]
4078 ; SSE-NEXT: movdqa %xmm3, %xmm0
4079 ; SSE-NEXT: pandn %xmm1, %xmm0
4080 ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
4081 ; SSE-NEXT: pand %xmm3, %xmm10
4082 ; SSE-NEXT: por %xmm0, %xmm10
4083 ; SSE-NEXT: packuswb %xmm5, %xmm0
4084 ; SSE-NEXT: movdqa %xmm2, %xmm1
4085 ; SSE-NEXT: pandn %xmm0, %xmm1
4086 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,3,2,3]
4087 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
4088 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
4089 ; SSE-NEXT: packuswb %xmm0, %xmm0
4090 ; SSE-NEXT: pand %xmm2, %xmm0
4091 ; SSE-NEXT: por %xmm0, %xmm1
4092 ; SSE-NEXT: movdqa %xmm8, %xmm0
4093 ; SSE-NEXT: pandn %xmm1, %xmm0
4094 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4095 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4096 ; SSE-NEXT: por %xmm1, %xmm13
4097 ; SSE-NEXT: movdqa %xmm13, %xmm1
4098 ; SSE-NEXT: pxor %xmm6, %xmm6
4099 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
4100 ; SSE-NEXT: movdqa %xmm3, %xmm2
4101 ; SSE-NEXT: pandn %xmm1, %xmm2
4102 ; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7]
4103 ; SSE-NEXT: pand %xmm3, %xmm13
4104 ; SSE-NEXT: movdqa %xmm3, %xmm5
4105 ; SSE-NEXT: por %xmm2, %xmm13
4106 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4107 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,3,2,3]
4108 ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1]
4109 ; SSE-NEXT: movdqa %xmm11, %xmm2
4110 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
4111 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535]
4112 ; SSE-NEXT: movdqa %xmm1, %xmm10
4113 ; SSE-NEXT: pandn %xmm2, %xmm10
4114 ; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
4115 ; SSE-NEXT: pand %xmm1, %xmm11
4116 ; SSE-NEXT: por %xmm10, %xmm11
4117 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,1,2,1]
4118 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
4119 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
4120 ; SSE-NEXT: packuswb %xmm2, %xmm3
4121 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255]
4122 ; SSE-NEXT: movdqa %xmm6, %xmm4
4123 ; SSE-NEXT: pandn %xmm3, %xmm4
4124 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,2,1,3]
4125 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
4126 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3]
4127 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
4128 ; SSE-NEXT: packuswb %xmm2, %xmm2
4129 ; SSE-NEXT: pand %xmm6, %xmm2
4130 ; SSE-NEXT: movdqa %xmm6, %xmm13
4131 ; SSE-NEXT: por %xmm2, %xmm4
4132 ; SSE-NEXT: pand %xmm8, %xmm4
4133 ; SSE-NEXT: por %xmm0, %xmm4
4134 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4135 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,0,65535]
4136 ; SSE-NEXT: movdqa %xmm10, %xmm0
4137 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4138 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4139 ; SSE-NEXT: pand %xmm10, %xmm2
4140 ; SSE-NEXT: por %xmm0, %xmm2
4141 ; SSE-NEXT: movdqa %xmm2, %xmm0
4142 ; SSE-NEXT: pxor %xmm6, %xmm6
4143 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
4144 ; SSE-NEXT: movdqa %xmm5, %xmm11
4145 ; SSE-NEXT: movdqa %xmm5, %xmm3
4146 ; SSE-NEXT: pandn %xmm0, %xmm3
4147 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
4148 ; SSE-NEXT: pand %xmm5, %xmm2
4149 ; SSE-NEXT: por %xmm3, %xmm2
4150 ; SSE-NEXT: packuswb %xmm15, %xmm0
4151 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,65535,65535,0]
4152 ; SSE-NEXT: movdqa %xmm4, %xmm3
4153 ; SSE-NEXT: pandn %xmm0, %xmm3
4154 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3]
4155 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
4156 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
4157 ; SSE-NEXT: packuswb %xmm0, %xmm0
4158 ; SSE-NEXT: pand %xmm4, %xmm0
4159 ; SSE-NEXT: por %xmm0, %xmm3
4160 ; SSE-NEXT: movdqa %xmm8, %xmm0
4161 ; SSE-NEXT: movdqa %xmm8, %xmm15
4162 ; SSE-NEXT: pandn %xmm3, %xmm0
4163 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4164 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4165 ; SSE-NEXT: pandn %xmm5, %xmm2
4166 ; SSE-NEXT: por %xmm2, %xmm12
4167 ; SSE-NEXT: movdqa %xmm12, %xmm2
4168 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
4169 ; SSE-NEXT: movdqa %xmm11, %xmm3
4170 ; SSE-NEXT: pandn %xmm2, %xmm3
4171 ; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
4172 ; SSE-NEXT: pand %xmm11, %xmm12
4173 ; SSE-NEXT: por %xmm3, %xmm12
4174 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4175 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[1,3,2,3]
4176 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4177 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4178 ; SSE-NEXT: movdqa %xmm3, %xmm2
4179 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
4180 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
4181 ; SSE-NEXT: pand %xmm1, %xmm3
4182 ; SSE-NEXT: pandn %xmm2, %xmm1
4183 ; SSE-NEXT: por %xmm3, %xmm1
4184 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
4185 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
4186 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
4187 ; SSE-NEXT: packuswb %xmm2, %xmm1
4188 ; SSE-NEXT: movdqa %xmm13, %xmm2
4189 ; SSE-NEXT: pandn %xmm1, %xmm2
4190 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,2,1,3]
4191 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
4192 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3]
4193 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
4194 ; SSE-NEXT: packuswb %xmm1, %xmm1
4195 ; SSE-NEXT: pand %xmm13, %xmm1
4196 ; SSE-NEXT: por %xmm1, %xmm2
4197 ; SSE-NEXT: pand %xmm15, %xmm2
4198 ; SSE-NEXT: por %xmm0, %xmm2
4199 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4200 ; SSE-NEXT: movdqa %xmm10, %xmm0
4201 ; SSE-NEXT: pandn %xmm7, %xmm0
4202 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4203 ; SSE-NEXT: pand %xmm10, %xmm2
4204 ; SSE-NEXT: por %xmm0, %xmm2
4205 ; SSE-NEXT: movdqa %xmm2, %xmm0
4206 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
4207 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
4208 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
4209 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7]
4210 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4211 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
4212 ; SSE-NEXT: psrlq $48, %xmm0
4213 ; SSE-NEXT: packuswb %xmm0, %xmm1
4214 ; SSE-NEXT: movdqa %xmm13, %xmm0
4215 ; SSE-NEXT: pandn %xmm1, %xmm0
4216 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,0,65535,65535]
4217 ; SSE-NEXT: movdqa %xmm3, %xmm1
4218 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4219 ; SSE-NEXT: pandn %xmm9, %xmm1
4220 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4221 ; SSE-NEXT: movdqa %xmm7, %xmm2
4222 ; SSE-NEXT: pand %xmm3, %xmm2
4223 ; SSE-NEXT: por %xmm1, %xmm2
4224 ; SSE-NEXT: movdqa %xmm2, %xmm1
4225 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
4226 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,0,65535,65535,65535,65535]
4227 ; SSE-NEXT: movdqa %xmm4, %xmm3
4228 ; SSE-NEXT: pandn %xmm1, %xmm3
4229 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
4230 ; SSE-NEXT: pand %xmm4, %xmm2
4231 ; SSE-NEXT: por %xmm3, %xmm2
4232 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7]
4233 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
4234 ; SSE-NEXT: packuswb %xmm1, %xmm1
4235 ; SSE-NEXT: pand %xmm13, %xmm1
4236 ; SSE-NEXT: por %xmm0, %xmm1
4237 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4238 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4239 ; SSE-NEXT: pandn %xmm12, %xmm0
4240 ; SSE-NEXT: por %xmm0, %xmm14
4241 ; SSE-NEXT: movdqa %xmm14, %xmm0
4242 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
4243 ; SSE-NEXT: movdqa %xmm11, %xmm2
4244 ; SSE-NEXT: pandn %xmm0, %xmm2
4245 ; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7]
4246 ; SSE-NEXT: pand %xmm11, %xmm14
4247 ; SSE-NEXT: por %xmm2, %xmm14
4248 ; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4249 ; SSE-NEXT: # xmm0 = mem[2,1,2,3,4,5,6,7]
4250 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
4251 ; SSE-NEXT: packuswb %xmm0, %xmm0
4252 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0]
4253 ; SSE-NEXT: movdqa %xmm10, %xmm2
4254 ; SSE-NEXT: pandn %xmm0, %xmm2
4255 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,2,1,0,4,5,6,7]
4256 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
4257 ; SSE-NEXT: packuswb %xmm0, %xmm0
4258 ; SSE-NEXT: pand %xmm10, %xmm0
4259 ; SSE-NEXT: por %xmm0, %xmm2
4260 ; SSE-NEXT: movdqa %xmm15, %xmm0
4261 ; SSE-NEXT: pandn %xmm2, %xmm0
4262 ; SSE-NEXT: pand %xmm15, %xmm1
4263 ; SSE-NEXT: por %xmm1, %xmm0
4264 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4265 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,65535,0,65535]
4266 ; SSE-NEXT: movdqa %xmm11, %xmm0
4267 ; SSE-NEXT: pandn %xmm8, %xmm0
4268 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4269 ; SSE-NEXT: pand %xmm11, %xmm1
4270 ; SSE-NEXT: por %xmm0, %xmm1
4271 ; SSE-NEXT: movdqa %xmm1, %xmm0
4272 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
4273 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
4274 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
4275 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
4276 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4277 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
4278 ; SSE-NEXT: psrlq $48, %xmm0
4279 ; SSE-NEXT: packuswb %xmm0, %xmm1
4280 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,65535,0,65535,65535]
4281 ; SSE-NEXT: movdqa %xmm8, %xmm0
4282 ; SSE-NEXT: movdqa %xmm5, %xmm11
4283 ; SSE-NEXT: pandn %xmm5, %xmm0
4284 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4285 ; SSE-NEXT: movdqa %xmm3, %xmm2
4286 ; SSE-NEXT: pand %xmm8, %xmm2
4287 ; SSE-NEXT: por %xmm0, %xmm2
4288 ; SSE-NEXT: movdqa %xmm2, %xmm0
4289 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
4290 ; SSE-NEXT: movdqa %xmm4, %xmm5
4291 ; SSE-NEXT: pandn %xmm0, %xmm5
4292 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
4293 ; SSE-NEXT: pand %xmm4, %xmm2
4294 ; SSE-NEXT: por %xmm5, %xmm2
4295 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,2,1,0,4,5,6,7]
4296 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
4297 ; SSE-NEXT: packuswb %xmm0, %xmm0
4298 ; SSE-NEXT: pand %xmm13, %xmm0
4299 ; SSE-NEXT: pandn %xmm1, %xmm13
4300 ; SSE-NEXT: por %xmm13, %xmm0
4301 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4302 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4303 ; SSE-NEXT: pandn %xmm8, %xmm1
4304 ; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload
4305 ; SSE-NEXT: por %xmm1, %xmm5
4306 ; SSE-NEXT: movdqa %xmm5, %xmm1
4307 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
4308 ; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
4309 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,0,65535]
4310 ; SSE-NEXT: pand %xmm2, %xmm5
4311 ; SSE-NEXT: pandn %xmm1, %xmm2
4312 ; SSE-NEXT: por %xmm5, %xmm2
4313 ; SSE-NEXT: movdqa %xmm2, %xmm5
4314 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4315 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,1,2,3,4,5,6,7]
4316 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
4317 ; SSE-NEXT: packuswb %xmm1, %xmm1
4318 ; SSE-NEXT: movdqa %xmm10, %xmm2
4319 ; SSE-NEXT: pandn %xmm1, %xmm2
4320 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,1,0,4,5,6,7]
4321 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7]
4322 ; SSE-NEXT: packuswb %xmm1, %xmm1
4323 ; SSE-NEXT: pand %xmm10, %xmm1
4324 ; SSE-NEXT: por %xmm1, %xmm2
4325 ; SSE-NEXT: movdqa %xmm15, %xmm1
4326 ; SSE-NEXT: pandn %xmm2, %xmm1
4327 ; SSE-NEXT: pand %xmm15, %xmm0
4328 ; SSE-NEXT: movdqa %xmm15, %xmm14
4329 ; SSE-NEXT: por %xmm0, %xmm1
4330 ; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
4331 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,65535,0,65535,65535]
4332 ; SSE-NEXT: movdqa %xmm15, %xmm0
4333 ; SSE-NEXT: pandn %xmm9, %xmm0
4334 ; SSE-NEXT: movdqa %xmm7, %xmm2
4335 ; SSE-NEXT: pand %xmm15, %xmm2
4336 ; SSE-NEXT: por %xmm0, %xmm2
4337 ; SSE-NEXT: movdqa %xmm2, %xmm0
4338 ; SSE-NEXT: pxor %xmm1, %xmm1
4339 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
4340 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
4341 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
4342 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
4343 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
4344 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
4345 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4346 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
4347 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4348 ; SSE-NEXT: pandn %xmm0, %xmm6
4349 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4350 ; SSE-NEXT: por %xmm6, %xmm5
4351 ; SSE-NEXT: packuswb %xmm0, %xmm5
4352 ; SSE-NEXT: packuswb %xmm2, %xmm2
4353 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,1,3,3]
4354 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
4355 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,65535,65535,65535,0,65535,65535]
4356 ; SSE-NEXT: movdqa %xmm9, %xmm2
4357 ; SSE-NEXT: pandn %xmm12, %xmm2
4358 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4359 ; SSE-NEXT: movdqa %xmm7, %xmm5
4360 ; SSE-NEXT: pand %xmm9, %xmm5
4361 ; SSE-NEXT: por %xmm2, %xmm5
4362 ; SSE-NEXT: movdqa %xmm5, %xmm2
4363 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
4364 ; SSE-NEXT: movdqa %xmm4, %xmm6
4365 ; SSE-NEXT: pandn %xmm2, %xmm6
4366 ; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
4367 ; SSE-NEXT: pand %xmm4, %xmm5
4368 ; SSE-NEXT: por %xmm6, %xmm5
4369 ; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4370 ; SSE-NEXT: # xmm2 = mem[0,1,2,1]
4371 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
4372 ; SSE-NEXT: packuswb %xmm2, %xmm2
4373 ; SSE-NEXT: movdqa %xmm10, %xmm6
4374 ; SSE-NEXT: pandn %xmm2, %xmm6
4375 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3]
4376 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
4377 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
4378 ; SSE-NEXT: packuswb %xmm2, %xmm2
4379 ; SSE-NEXT: pand %xmm10, %xmm2
4380 ; SSE-NEXT: por %xmm2, %xmm6
4381 ; SSE-NEXT: movdqa %xmm14, %xmm1
4382 ; SSE-NEXT: pandn %xmm6, %xmm14
4383 ; SSE-NEXT: andps %xmm1, %xmm0
4384 ; SSE-NEXT: movdqa %xmm1, %xmm6
4385 ; SSE-NEXT: por %xmm0, %xmm14
4386 ; SSE-NEXT: movdqa %xmm15, %xmm1
4387 ; SSE-NEXT: movdqa %xmm15, %xmm0
4388 ; SSE-NEXT: movdqa %xmm11, %xmm15
4389 ; SSE-NEXT: pandn %xmm11, %xmm0
4390 ; SSE-NEXT: pand %xmm1, %xmm3
4391 ; SSE-NEXT: por %xmm0, %xmm3
4392 ; SSE-NEXT: movdqa %xmm3, %xmm0
4393 ; SSE-NEXT: pxor %xmm1, %xmm1
4394 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
4395 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
4396 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
4397 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,1,2,3]
4398 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
4399 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
4400 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4401 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
4402 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4403 ; SSE-NEXT: pandn %xmm0, %xmm3
4404 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4405 ; SSE-NEXT: por %xmm3, %xmm2
4406 ; SSE-NEXT: packuswb %xmm0, %xmm2
4407 ; SSE-NEXT: packuswb %xmm5, %xmm5
4408 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3]
4409 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3]
4410 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4411 ; SSE-NEXT: movdqa %xmm2, %xmm5
4412 ; SSE-NEXT: movdqa %xmm2, %xmm3
4413 ; SSE-NEXT: movdqa %xmm9, %xmm2
4414 ; SSE-NEXT: pand %xmm9, %xmm5
4415 ; SSE-NEXT: pandn %xmm8, %xmm2
4416 ; SSE-NEXT: movdqa %xmm8, %xmm9
4417 ; SSE-NEXT: por %xmm5, %xmm2
4418 ; SSE-NEXT: movdqa %xmm2, %xmm5
4419 ; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
4420 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
4421 ; SSE-NEXT: pand %xmm4, %xmm2
4422 ; SSE-NEXT: pandn %xmm5, %xmm4
4423 ; SSE-NEXT: por %xmm2, %xmm4
4424 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
4425 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
4426 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
4427 ; SSE-NEXT: packuswb %xmm4, %xmm4
4428 ; SSE-NEXT: pand %xmm10, %xmm4
4429 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,1,2,1]
4430 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
4431 ; SSE-NEXT: packuswb %xmm5, %xmm5
4432 ; SSE-NEXT: pandn %xmm5, %xmm10
4433 ; SSE-NEXT: por %xmm4, %xmm10
4434 ; SSE-NEXT: movdqa %xmm6, %xmm4
4435 ; SSE-NEXT: pandn %xmm10, %xmm4
4436 ; SSE-NEXT: andps %xmm6, %xmm0
4437 ; SSE-NEXT: por %xmm0, %xmm4
4438 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4439 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535]
4440 ; SSE-NEXT: pand %xmm2, %xmm0
4441 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4442 ; SSE-NEXT: movdqa %xmm0, %xmm5
4443 ; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
4444 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,65535,65535,65535,65535,0]
4445 ; SSE-NEXT: movdqa %xmm10, %xmm6
4446 ; SSE-NEXT: pandn %xmm5, %xmm6
4447 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4448 ; SSE-NEXT: pand %xmm10, %xmm0
4449 ; SSE-NEXT: por %xmm6, %xmm0
4450 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
4451 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,5,4,7,6]
4452 ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4453 ; SSE-NEXT: packuswb %xmm5, %xmm8
4454 ; SSE-NEXT: movdqa %xmm2, %xmm11
4455 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4456 ; SSE-NEXT: pandn %xmm0, %xmm11
4457 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
4458 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4459 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[0,2,2,3]
4460 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
4461 ; SSE-NEXT: movdqa %xmm5, %xmm0
4462 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
4463 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
4464 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
4465 ; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
4466 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
4467 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
4468 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
4469 ; SSE-NEXT: packuswb %xmm5, %xmm5
4470 ; SSE-NEXT: movss {{.*#+}} xmm8 = xmm5[0],xmm8[1,2,3]
4471 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,65535,0,65535,65535]
4472 ; SSE-NEXT: movdqa %xmm6, %xmm0
4473 ; SSE-NEXT: pandn %xmm12, %xmm0
4474 ; SSE-NEXT: movdqa %xmm7, %xmm5
4475 ; SSE-NEXT: pand %xmm6, %xmm5
4476 ; SSE-NEXT: por %xmm0, %xmm5
4477 ; SSE-NEXT: movdqa %xmm5, %xmm0
4478 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4479 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,0,65535,65535,65535]
4480 ; SSE-NEXT: movdqa %xmm6, %xmm7
4481 ; SSE-NEXT: pandn %xmm0, %xmm7
4482 ; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
4483 ; SSE-NEXT: pand %xmm6, %xmm5
4484 ; SSE-NEXT: por %xmm7, %xmm5
4485 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4486 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4487 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
4488 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,6]
4489 ; SSE-NEXT: packuswb %xmm0, %xmm0
4490 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
4491 ; SSE-NEXT: movdqa %xmm7, %xmm12
4492 ; SSE-NEXT: pandn %xmm0, %xmm12
4493 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,1,0,3]
4494 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
4495 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
4496 ; SSE-NEXT: packuswb %xmm0, %xmm0
4497 ; SSE-NEXT: pand %xmm7, %xmm0
4498 ; SSE-NEXT: por %xmm0, %xmm12
4499 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
4500 ; SSE-NEXT: movdqa %xmm0, %xmm5
4501 ; SSE-NEXT: pandn %xmm12, %xmm5
4502 ; SSE-NEXT: andps %xmm0, %xmm8
4503 ; SSE-NEXT: por %xmm8, %xmm5
4504 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4505 ; SSE-NEXT: pand %xmm2, %xmm0
4506 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4507 ; SSE-NEXT: movdqa %xmm0, %xmm12
4508 ; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15]
4509 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4510 ; SSE-NEXT: pand %xmm10, %xmm0
4511 ; SSE-NEXT: pandn %xmm12, %xmm10
4512 ; SSE-NEXT: por %xmm0, %xmm10
4513 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,1,0,3]
4514 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,5,4,7,6]
4515 ; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4516 ; SSE-NEXT: packuswb %xmm12, %xmm8
4517 ; SSE-NEXT: movdqa %xmm13, %xmm12
4518 ; SSE-NEXT: pand %xmm2, %xmm12
4519 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4520 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,2,2,3]
4521 ; SSE-NEXT: pand %xmm2, %xmm10
4522 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4523 ; SSE-NEXT: pandn %xmm15, %xmm2
4524 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm15[1,3,2,3]
4525 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
4526 ; SSE-NEXT: movdqa %xmm0, %xmm10
4527 ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15]
4528 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
4529 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7]
4530 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4531 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
4532 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
4533 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
4534 ; SSE-NEXT: packuswb %xmm0, %xmm0
4535 ; SSE-NEXT: movss {{.*#+}} xmm8 = xmm0[0],xmm8[1,2,3]
4536 ; SSE-NEXT: movdqa %xmm3, %xmm13
4537 ; SSE-NEXT: movdqa %xmm3, %xmm0
4538 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,0,65535,65535,0,65535,65535]
4539 ; SSE-NEXT: pand %xmm3, %xmm0
4540 ; SSE-NEXT: pandn %xmm9, %xmm3
4541 ; SSE-NEXT: movdqa %xmm9, %xmm15
4542 ; SSE-NEXT: por %xmm0, %xmm3
4543 ; SSE-NEXT: movdqa %xmm3, %xmm0
4544 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4545 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
4546 ; SSE-NEXT: pand %xmm6, %xmm3
4547 ; SSE-NEXT: pandn %xmm0, %xmm6
4548 ; SSE-NEXT: por %xmm3, %xmm6
4549 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4550 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4551 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
4552 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,6]
4553 ; SSE-NEXT: packuswb %xmm0, %xmm0
4554 ; SSE-NEXT: movdqa %xmm7, %xmm9
4555 ; SSE-NEXT: pandn %xmm0, %xmm9
4556 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,1,0,3]
4557 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,3,2,4,5,6,7]
4558 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
4559 ; SSE-NEXT: packuswb %xmm0, %xmm0
4560 ; SSE-NEXT: pand %xmm7, %xmm0
4561 ; SSE-NEXT: por %xmm0, %xmm9
4562 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
4563 ; SSE-NEXT: movdqa %xmm3, %xmm6
4564 ; SSE-NEXT: pandn %xmm9, %xmm6
4565 ; SSE-NEXT: andps %xmm3, %xmm8
4566 ; SSE-NEXT: por %xmm8, %xmm6
4567 ; SSE-NEXT: movdqa %xmm12, %xmm1
4568 ; SSE-NEXT: por %xmm11, %xmm1
4569 ; SSE-NEXT: movdqa %xmm1, %xmm0
4570 ; SSE-NEXT: pxor %xmm9, %xmm9
4571 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
4572 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
4573 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
4574 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,1,2,1]
4575 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7]
4576 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
4577 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4578 ; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15]
4579 ; SSE-NEXT: pxor %xmm1, %xmm1
4580 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4581 ; SSE-NEXT: pandn %xmm8, %xmm10
4582 ; SSE-NEXT: movdqa %xmm8, %xmm9
4583 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4584 ; SSE-NEXT: por %xmm10, %xmm8
4585 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,1,3]
4586 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7]
4587 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0]
4588 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,7,6,5]
4589 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[3,3,3,3]
4590 ; SSE-NEXT: packuswb %xmm8, %xmm10
4591 ; SSE-NEXT: packuswb %xmm0, %xmm0
4592 ; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3]
4593 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4594 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4595 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
4596 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
4597 ; SSE-NEXT: packuswb %xmm0, %xmm0
4598 ; SSE-NEXT: movdqa %xmm7, %xmm8
4599 ; SSE-NEXT: pandn %xmm0, %xmm8
4600 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4601 ; SSE-NEXT: # xmm0 = mem[1,3,2,3]
4602 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
4603 ; SSE-NEXT: # xmm11 = mem[0,2,2,3]
4604 ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
4605 ; SSE-NEXT: movdqa %xmm11, %xmm0
4606 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4607 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,0,65535,65535]
4608 ; SSE-NEXT: movdqa %xmm9, %xmm12
4609 ; SSE-NEXT: pandn %xmm0, %xmm12
4610 ; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15]
4611 ; SSE-NEXT: pand %xmm9, %xmm11
4612 ; SSE-NEXT: por %xmm12, %xmm11
4613 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,1,1,1]
4614 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
4615 ; SSE-NEXT: packuswb %xmm0, %xmm0
4616 ; SSE-NEXT: pand %xmm7, %xmm0
4617 ; SSE-NEXT: por %xmm8, %xmm0
4618 ; SSE-NEXT: movaps %xmm3, %xmm1
4619 ; SSE-NEXT: movdqa %xmm3, %xmm8
4620 ; SSE-NEXT: pandn %xmm0, %xmm8
4621 ; SSE-NEXT: andps %xmm3, %xmm10
4622 ; SSE-NEXT: por %xmm10, %xmm8
4623 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4624 ; SSE-NEXT: movdqa %xmm2, %xmm0
4625 ; SSE-NEXT: pxor %xmm11, %xmm11
4626 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
4627 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
4628 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm11[8],xmm2[9],xmm11[9],xmm2[10],xmm11[10],xmm2[11],xmm11[11],xmm2[12],xmm11[12],xmm2[13],xmm11[13],xmm2[14],xmm11[14],xmm2[15],xmm11[15]
4629 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,1,2,1]
4630 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,7,6,7]
4631 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
4632 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4633 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm11[8],xmm2[9],xmm11[9],xmm2[10],xmm11[10],xmm2[11],xmm11[11],xmm2[12],xmm11[12],xmm2[13],xmm11[13],xmm2[14],xmm11[14],xmm2[15],xmm11[15]
4634 ; SSE-NEXT: pxor %xmm12, %xmm12
4635 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4636 ; SSE-NEXT: pandn %xmm2, %xmm10
4637 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4638 ; SSE-NEXT: por %xmm10, %xmm3
4639 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,2,1,3]
4640 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7]
4641 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0]
4642 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,5]
4643 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[3,3,3,3]
4644 ; SSE-NEXT: packuswb %xmm11, %xmm10
4645 ; SSE-NEXT: packuswb %xmm0, %xmm0
4646 ; SSE-NEXT: movss {{.*#+}} xmm10 = xmm0[0],xmm10[1,2,3]
4647 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4648 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4649 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,3,2,3]
4650 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[0,2,2,3]
4651 ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
4652 ; SSE-NEXT: movdqa %xmm11, %xmm0
4653 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
4654 ; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15]
4655 ; SSE-NEXT: pand %xmm9, %xmm11
4656 ; SSE-NEXT: pandn %xmm0, %xmm9
4657 ; SSE-NEXT: por %xmm11, %xmm9
4658 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,1,1,1]
4659 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
4660 ; SSE-NEXT: packuswb %xmm0, %xmm0
4661 ; SSE-NEXT: pand %xmm7, %xmm0
4662 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,1,0,3]
4663 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7]
4664 ; SSE-NEXT: packuswb %xmm9, %xmm9
4665 ; SSE-NEXT: pandn %xmm9, %xmm7
4666 ; SSE-NEXT: por %xmm7, %xmm0
4667 ; SSE-NEXT: andps %xmm1, %xmm10
4668 ; SSE-NEXT: andnps %xmm0, %xmm1
4669 ; SSE-NEXT: orps %xmm10, %xmm1
4670 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4671 ; SSE-NEXT: movaps %xmm0, (%rsi)
4672 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4673 ; SSE-NEXT: movaps %xmm0, 16(%rsi)
4674 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4675 ; SSE-NEXT: movaps %xmm0, (%rdx)
4676 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4677 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
4678 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4679 ; SSE-NEXT: movaps %xmm0, (%rcx)
4680 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4681 ; SSE-NEXT: movaps %xmm0, 16(%rcx)
4682 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
4683 ; SSE-NEXT: movaps %xmm0, (%r8)
4684 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4685 ; SSE-NEXT: movaps %xmm0, 16(%r8)
4686 ; SSE-NEXT: movdqa %xmm4, (%r9)
4687 ; SSE-NEXT: movdqa %xmm14, 16(%r9)
4688 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
4689 ; SSE-NEXT: movdqa %xmm6, (%rax)
4690 ; SSE-NEXT: movdqa %xmm5, 16(%rax)
4691 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
4692 ; SSE-NEXT: movaps %xmm1, (%rax)
4693 ; SSE-NEXT: movdqa %xmm8, 16(%rax)
4694 ; SSE-NEXT: addq $648, %rsp # imm = 0x288
4695 ; SSE-NEXT: retq
4696 ;
4697 ; AVX-LABEL: load_i8_stride7_vf32:
4698 ; AVX: # %bb.0:
4699 ; AVX-NEXT: subq $200, %rsp
4700 ; AVX-NEXT: vmovdqa 176(%rdi), %xmm7
4701 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13,u,u,u,u]
4702 ; AVX-NEXT: vmovdqa 160(%rdi), %xmm6
4703 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm6[u,u,u,u]
4704 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm1
4705 ; AVX-NEXT: vmovdqa 144(%rdi), %xmm8
4706 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,u],zero,zero,xmm8[3,10,u,u,u,u,u,u,u,u,u]
4707 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm9
4708 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,5,12],zero,zero,xmm9[u,u,u,u,u,u,u,u,u]
4709 ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm2
4710 ; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,0,0,0,0,255,255,255,255,255,u,u,u,u]
4711 ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
4712 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4713 ; AVX-NEXT: vmovdqa (%rdi), %xmm3
4714 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm11
4715 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm15
4716 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm4
4717 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u,u,u,u,u]
4718 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[0,7,14],zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u]
4719 ; AVX-NEXT: vmovdqa %xmm3, %xmm10
4720 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1
4721 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,3,10],zero,zero,zero,xmm15[u,u,u,u,u,u]
4722 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
4723 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
4724 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
4725 ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
4726 ; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4727 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[u,u,u,u]
4728 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14,u,u,u,u]
4729 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1
4730 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,zero,xmm8[4,11,u,u,u,u,u,u,u,u,u]
4731 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,6,13],zero,zero,xmm9[u,u,u,u,u,u,u,u,u]
4732 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
4733 ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
4734 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4735 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u,u,u,u,u]
4736 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[1,8,15],zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u]
4737 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm1
4738 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u]
4739 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,4,11],zero,zero,xmm15[u,u,u,u,u,u,u]
4740 ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm2
4741 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
4742 ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
4743 ; AVX-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
4744 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u],zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u,u,u]
4745 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,0,7,14],zero,zero,xmm9[u,u,u,u,u,u,u,u,u]
4746 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1
4747 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u,u,u]
4748 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15,u,u,u,u]
4749 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
4750 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
4751 ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1
4752 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4753 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u]
4754 ; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4755 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm11[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
4756 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1
4757 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4758 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10,u,u,u,u,u,u,u]
4759 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,5,12],zero,zero,xmm15[u,u,u,u,u,u,u]
4760 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
4761 ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
4762 ; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4763 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u]
4764 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm11[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
4765 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1
4766 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11,u,u,u,u,u,u,u]
4767 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,6,13],zero,zero,xmm15[u,u,u,u,u,u,u]
4768 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
4769 ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
4770 ; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4771 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u,u,u]
4772 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u,u,u]
4773 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm1
4774 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9,u,u,u,u,u]
4775 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u,u,u]
4776 ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm2
4777 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
4778 ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm2
4779 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u]
4780 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u],zero,zero,xmm8[0,7,14,u,u,u,u,u,u,u,u,u]
4781 ; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1
4782 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[3,10,u,u,u,u,u]
4783 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,5,12],zero,zero,xmm6[u,u,u,u,u]
4784 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
4785 ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm3, %xmm3
4786 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,3,10],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u]
4787 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u],zero,zero,xmm8[1,8,15,u,u,u,u,u,u,u,u,u]
4788 ; AVX-NEXT: vpor %xmm1, %xmm4, %xmm1
4789 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[4,11,u,u,u,u,u]
4790 ; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,6,13],zero,zero,xmm6[u,u,u,u,u]
4791 ; AVX-NEXT: vpor %xmm4, %xmm12, %xmm4
4792 ; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm4, %xmm4
4793 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm5
4794 ; AVX-NEXT: vmovdqa 208(%rdi), %xmm1
4795 ; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,12]
4796 ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
4797 ; AVX-NEXT: vpor %xmm12, %xmm13, %xmm13
4798 ; AVX-NEXT: vpmovsxdq {{.*#+}} xmm12 = [18446744073709486080,16777215]
4799 ; AVX-NEXT: vpblendvb %xmm12, %xmm2, %xmm13, %xmm0
4800 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4801 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[6,13]
4802 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
4803 ; AVX-NEXT: vpor %xmm2, %xmm14, %xmm2
4804 ; AVX-NEXT: vpblendvb %xmm12, %xmm3, %xmm2, %xmm0
4805 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4806 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
4807 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[0,7,14]
4808 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
4809 ; AVX-NEXT: vpblendvb %xmm12, %xmm4, %xmm2, %xmm0
4810 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4811 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
4812 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,4,11,u,u,u,u,u,u,u,u,u,u,u,u]
4813 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4814 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u],zero,zero,zero,xmm7[5,12,u,u,u,u,u]
4815 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[u,u,u,u,u]
4816 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
4817 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3,4,5,6,7]
4818 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
4819 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm1[1,8,15]
4820 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
4821 ; AVX-NEXT: vpblendvb %xmm12, %xmm2, %xmm3, %xmm0
4822 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4823 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
4824 ; AVX-NEXT: vpshufb %xmm10, %xmm5, %xmm2
4825 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
4826 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
4827 ; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
4828 ; AVX-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm2[6,7]
4829 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13]
4830 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm6
4831 ; AVX-NEXT: vpshufb %xmm7, %xmm6, %xmm2
4832 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm9
4833 ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,4,11,u,u,u,u,u,u,u,u,u,u]
4834 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
4835 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm8
4836 ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7,8,9]
4837 ; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,0,1,2,3,8,15]
4838 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm2
4839 ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[0,7,14,u,u,u,u,u,u,u,u,u,u,u,u,u]
4840 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12
4841 ; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
4842 ; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload
4843 ; AVX-NEXT: vandnps %ymm12, %ymm13, %ymm12
4844 ; AVX-NEXT: vorps %ymm0, %ymm12, %ymm0
4845 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4846 ; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
4847 ; AVX-NEXT: vandnps %ymm3, %ymm12, %ymm3
4848 ; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0
4849 ; AVX-NEXT: vorps %ymm3, %ymm0, %ymm0
4850 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4851 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
4852 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
4853 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
4854 ; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
4855 ; AVX-NEXT: # xmm4 = mem[0,1,2,3,4,5],xmm0[6,7]
4856 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u]
4857 ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u]
4858 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1]
4859 ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
4860 ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,6],xmm3[7]
4861 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[3,10]
4862 ; AVX-NEXT: vpor %xmm14, %xmm13, %xmm13
4863 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u]
4864 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13
4865 ; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
4866 ; AVX-NEXT: vandnps (%rsp), %ymm14, %ymm0 # 32-byte Folded Reload
4867 ; AVX-NEXT: vandps %ymm14, %ymm13, %ymm13
4868 ; AVX-NEXT: vorps %ymm0, %ymm13, %ymm0
4869 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
4870 ; AVX-NEXT: vandnps %ymm4, %ymm12, %ymm4
4871 ; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0
4872 ; AVX-NEXT: vorps %ymm4, %ymm0, %ymm0
4873 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4874 ; AVX-NEXT: vpshufb %xmm10, %xmm1, %xmm0
4875 ; AVX-NEXT: vpshufb %xmm7, %xmm5, %xmm1
4876 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4877 ; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4878 ; AVX-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7]
4879 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u]
4880 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15]
4881 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
4882 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7]
4883 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[4,11]
4884 ; AVX-NEXT: vpor %xmm4, %xmm1, %xmm1
4885 ; AVX-NEXT: vmovd {{.*#+}} xmm12 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
4886 ; AVX-NEXT: vpshufb %xmm12, %xmm2, %xmm4
4887 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm4
4888 ; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
4889 ; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
4890 ; AVX-NEXT: vandnps %ymm4, %ymm1, %ymm4
4891 ; AVX-NEXT: vorps %ymm4, %ymm5, %ymm4
4892 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5
4893 ; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
4894 ; AVX-NEXT: vandnps %ymm5, %ymm13, %ymm5
4895 ; AVX-NEXT: vandps %ymm4, %ymm13, %ymm4
4896 ; AVX-NEXT: vorps %ymm5, %ymm4, %ymm0
4897 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4898 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[u,u]
4899 ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14,u,u]
4900 ; AVX-NEXT: vpor %xmm4, %xmm7, %xmm4
4901 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm3[7]
4902 ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[5,12]
4903 ; AVX-NEXT: vpor %xmm7, %xmm4, %xmm7
4904 ; AVX-NEXT: vmovd {{.*#+}} xmm4 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
4905 ; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm10
4906 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7
4907 ; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
4908 ; AVX-NEXT: vandnps %ymm7, %ymm1, %ymm7
4909 ; AVX-NEXT: vorps %ymm7, %ymm10, %ymm7
4910 ; AVX-NEXT: vandps %ymm7, %ymm13, %ymm7
4911 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 16-byte Folded Reload
4912 ; AVX-NEXT: vandnps %ymm10, %ymm13, %ymm10
4913 ; AVX-NEXT: vorps %ymm7, %ymm10, %ymm0
4914 ; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
4915 ; AVX-NEXT: vpshufb %xmm12, %xmm11, %xmm7
4916 ; AVX-NEXT: vmovd {{.*#+}} xmm14 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
4917 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4918 ; AVX-NEXT: vpshufb %xmm14, %xmm5, %xmm10
4919 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
4920 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4921 ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u],zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u]
4922 ; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,u,0,7,14],zero,zero,xmm15[u,u,u,u,u,u,u]
4923 ; AVX-NEXT: vpor %xmm10, %xmm12, %xmm10
4924 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3,4,5,6,7]
4925 ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u]
4926 ; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15,u,u]
4927 ; AVX-NEXT: vpor %xmm10, %xmm12, %xmm10
4928 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5,6],xmm3[7]
4929 ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[6,13]
4930 ; AVX-NEXT: vpor %xmm3, %xmm10, %xmm3
4931 ; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm10
4932 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3
4933 ; AVX-NEXT: vandps %ymm1, %ymm7, %ymm7
4934 ; AVX-NEXT: vandnps %ymm3, %ymm1, %ymm3
4935 ; AVX-NEXT: vorps %ymm3, %ymm7, %ymm3
4936 ; AVX-NEXT: vandps %ymm3, %ymm13, %ymm3
4937 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload
4938 ; AVX-NEXT: vandnps %ymm7, %ymm13, %ymm7
4939 ; AVX-NEXT: vorps %ymm7, %ymm3, %ymm3
4940 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4941 ; AVX-NEXT: vpshufb %xmm4, %xmm11, %xmm4
4942 ; AVX-NEXT: vmovd {{.*#+}} xmm7 = [5,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
4943 ; AVX-NEXT: vpshufb %xmm7, %xmm5, %xmm10
4944 ; AVX-NEXT: vmovdqa %xmm5, %xmm3
4945 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
4946 ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u],zero,zero,zero,xmm0[6,13,u,u,u,u,u,u,u]
4947 ; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,u,u,1,8,15],zero,zero,xmm15[u,u,u,u,u,u,u]
4948 ; AVX-NEXT: vpor %xmm10, %xmm12, %xmm10
4949 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3,4,5,6,7]
4950 ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[2,9,u,u,u]
4951 ; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u]
4952 ; AVX-NEXT: vpor %xmm10, %xmm12, %xmm10
4953 ; AVX-NEXT: vmovddup {{.*#+}} xmm12 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128]
4954 ; AVX-NEXT: # xmm12 = mem[0,0]
4955 ; AVX-NEXT: vpshufb %xmm12, %xmm10, %xmm10
4956 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[0,7,14]
4957 ; AVX-NEXT: vpor %xmm5, %xmm10, %xmm5
4958 ; AVX-NEXT: vpshufb %xmm7, %xmm2, %xmm7
4959 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5
4960 ; AVX-NEXT: vandps %ymm1, %ymm4, %ymm4
4961 ; AVX-NEXT: vandnps %ymm5, %ymm1, %ymm5
4962 ; AVX-NEXT: vorps %ymm5, %ymm4, %ymm4
4963 ; AVX-NEXT: vandps %ymm4, %ymm13, %ymm4
4964 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload
4965 ; AVX-NEXT: vandnps %ymm5, %ymm13, %ymm5
4966 ; AVX-NEXT: vorps %ymm5, %ymm4, %ymm4
4967 ; AVX-NEXT: vpshufb %xmm14, %xmm11, %xmm5
4968 ; AVX-NEXT: vmovd {{.*#+}} xmm7 = [6,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
4969 ; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm10
4970 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
4971 ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[u,u,u,u,2,9],zero,zero,zero,xmm15[u,u,u,u,u,u,u]
4972 ; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u]
4973 ; AVX-NEXT: vpor %xmm10, %xmm11, %xmm10
4974 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm10[2,3,4,5,6,7]
4975 ; AVX-NEXT: vpshufb %xmm7, %xmm2, %xmm2
4976 ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,xmm9[3,10,u,u,u]
4977 ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm6[u,u,u]
4978 ; AVX-NEXT: vpor %xmm7, %xmm6, %xmm6
4979 ; AVX-NEXT: vpshufb %xmm12, %xmm6, %xmm6
4980 ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm8[1,8,15]
4981 ; AVX-NEXT: vpor %xmm7, %xmm6, %xmm6
4982 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2
4983 ; AVX-NEXT: vandps %ymm1, %ymm5, %ymm5
4984 ; AVX-NEXT: vandnps %ymm2, %ymm1, %ymm1
4985 ; AVX-NEXT: vorps %ymm1, %ymm5, %ymm1
4986 ; AVX-NEXT: vandps %ymm1, %ymm13, %ymm1
4987 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload
4988 ; AVX-NEXT: vandnps %ymm2, %ymm13, %ymm0
4989 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
4990 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4991 ; AVX-NEXT: vmovaps %ymm1, (%rsi)
4992 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4993 ; AVX-NEXT: vmovaps %ymm1, (%rdx)
4994 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4995 ; AVX-NEXT: vmovaps %ymm1, (%rcx)
4996 ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
4997 ; AVX-NEXT: vmovaps %ymm1, (%r8)
4998 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4999 ; AVX-NEXT: vmovaps %ymm1, (%r9)
5000 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
5001 ; AVX-NEXT: vmovaps %ymm4, (%rax)
5002 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
5003 ; AVX-NEXT: vmovaps %ymm0, (%rax)
5004 ; AVX-NEXT: addq $200, %rsp
5005 ; AVX-NEXT: vzeroupper
5006 ; AVX-NEXT: retq
5007 ;
5008 ; AVX2-LABEL: load_i8_stride7_vf32:
5009 ; AVX2: # %bb.0:
5010 ; AVX2-NEXT: subq $72, %rsp
5011 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm10
5012 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm11
5013 ; AVX2-NEXT: vmovdqa (%rdi), %ymm6
5014 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm7
5015 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm13
5016 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3
5017 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
5018 ; AVX2-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm0
5019 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
5020 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u]
5021 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
5022 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
5023 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
5024 ; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm1
5025 ; AVX2-NEXT: vmovdqa %ymm2, %ymm14
5026 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
5027 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
5028 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
5029 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0]
5030 ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
5031 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5032 ; AVX2-NEXT: vpblendvb %ymm1, %ymm10, %ymm11, %ymm1
5033 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
5034 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
5035 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
5036 ; AVX2-NEXT: vpor %xmm4, %xmm1, %xmm1
5037 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5038 ; AVX2-NEXT: vpblendvb %ymm2, %ymm6, %ymm7, %ymm4
5039 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
5040 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u]
5041 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u]
5042 ; AVX2-NEXT: vpor %xmm5, %xmm4, %xmm4
5043 ; AVX2-NEXT: vpblendvb %ymm15, %ymm13, %ymm3, %ymm5
5044 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm8
5045 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7,8,9],ymm8[10],ymm5[11,12,13],ymm8[14],ymm5[15]
5046 ; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
5047 ; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,18446744073709551360,16777215,0]
5048 ; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm8
5049 ; AVX2-NEXT: vmovdqa 192(%rdi), %xmm4
5050 ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u]
5051 ; AVX2-NEXT: vmovdqa 208(%rdi), %xmm5
5052 ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
5053 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3]
5054 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5055 ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
5056 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7]
5057 ; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm9 = [18446744073709551615,18446744073709551615,16777215,0]
5058 ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0
5059 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5060 ; AVX2-NEXT: vpblendvb %ymm14, %ymm10, %ymm11, %ymm0
5061 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u]
5062 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
5063 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u]
5064 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
5065 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
5066 ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
5067 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
5068 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5069 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5070 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
5071 ; AVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm0
5072 ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
5073 ; AVX2-NEXT: vpblendvb %ymm2, %ymm11, %ymm10, %ymm0
5074 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
5075 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u]
5076 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u]
5077 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
5078 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12]
5079 ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
5080 ; AVX2-NEXT: vpor %xmm1, %xmm8, %xmm1
5081 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8
5082 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5083 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
5084 ; AVX2-NEXT: # ymm0 = mem[0,1,0,1]
5085 ; AVX2-NEXT: vpblendvb %ymm0, %ymm8, %ymm1, %ymm1
5086 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5087 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
5088 ; AVX2-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm1
5089 ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u]
5090 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
5091 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u]
5092 ; AVX2-NEXT: vpor %xmm1, %xmm8, %xmm1
5093 ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13]
5094 ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
5095 ; AVX2-NEXT: vpor %xmm8, %xmm12, %xmm8
5096 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5097 ; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
5098 ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm8, %ymm1
5099 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5100 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
5101 ; AVX2-NEXT: vpblendvb %ymm8, %ymm11, %ymm10, %ymm1
5102 ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u]
5103 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
5104 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u]
5105 ; AVX2-NEXT: vpor %xmm1, %xmm12, %xmm1
5106 ; AVX2-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
5107 ; AVX2-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14]
5108 ; AVX2-NEXT: vpor %xmm12, %xmm14, %xmm12
5109 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5110 ; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
5111 ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm12, %ymm1
5112 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5113 ; AVX2-NEXT: vpblendvb %ymm15, %ymm11, %ymm10, %ymm14
5114 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
5115 ; AVX2-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
5116 ; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm11
5117 ; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u]
5118 ; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u]
5119 ; AVX2-NEXT: vpor %xmm11, %xmm10, %xmm10
5120 ; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
5121 ; AVX2-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15]
5122 ; AVX2-NEXT: vpor %xmm11, %xmm15, %xmm11
5123 ; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
5124 ; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
5125 ; AVX2-NEXT: vpblendvb %ymm0, %ymm10, %ymm11, %ymm0
5126 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5127 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
5128 ; AVX2-NEXT: vpblendvb %ymm0, %ymm7, %ymm6, %ymm10
5129 ; AVX2-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm11
5130 ; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm15
5131 ; AVX2-NEXT: vpblendvb %ymm12, %ymm7, %ymm6, %ymm1
5132 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5133 ; AVX2-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6
5134 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5135 ; AVX2-NEXT: vpblendvb %ymm0, %ymm13, %ymm3, %ymm0
5136 ; AVX2-NEXT: vpblendvb %ymm9, %ymm13, %ymm3, %ymm9
5137 ; AVX2-NEXT: vpblendvb %ymm8, %ymm13, %ymm3, %ymm8
5138 ; AVX2-NEXT: vpblendvb %ymm12, %ymm3, %ymm13, %ymm7
5139 ; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm2
5140 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u]
5141 ; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm11
5142 ; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u]
5143 ; AVX2-NEXT: vpor %xmm3, %xmm11, %xmm3
5144 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm11
5145 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7,8,9,10],ymm11[11],ymm0[12,13],ymm11[14],ymm0[15]
5146 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5147 ; AVX2-NEXT: vpmovsxwq {{.*#+}} xmm11 = [18446744073709551615,255]
5148 ; AVX2-NEXT: vpblendvb %ymm11, %ymm3, %ymm0, %ymm0
5149 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u]
5150 ; AVX2-NEXT: vextracti128 $1, %ymm15, %xmm13
5151 ; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u]
5152 ; AVX2-NEXT: vpor %xmm3, %xmm13, %xmm3
5153 ; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm13
5154 ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1,2],ymm13[3],ymm9[4,5,6],ymm13[7,8],ymm9[9,10],ymm13[11],ymm9[12,13,14],ymm13[15]
5155 ; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5156 ; AVX2-NEXT: vpblendvb %ymm11, %ymm3, %ymm9, %ymm3
5157 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm9
5158 ; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u]
5159 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u]
5160 ; AVX2-NEXT: vpor %xmm1, %xmm9, %xmm1
5161 ; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9
5162 ; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3],ymm9[4],ymm8[5,6],ymm9[7,8],ymm8[9,10,11],ymm9[12],ymm8[13,14],ymm9[15]
5163 ; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5164 ; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm8, %ymm9
5165 ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm1
5166 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u]
5167 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u]
5168 ; AVX2-NEXT: vpor %xmm1, %xmm6, %xmm1
5169 ; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm6
5170 ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15]
5171 ; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5172 ; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm6, %ymm1
5173 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u]
5174 ; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm7
5175 ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u]
5176 ; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6
5177 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm7
5178 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15]
5179 ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5180 ; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm2
5181 ; AVX2-NEXT: vextracti128 $1, %ymm14, %xmm6
5182 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u]
5183 ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
5184 ; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6
5185 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u]
5186 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
5187 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
5188 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5
5189 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
5190 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
5191 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15]
5192 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
5193 ; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
5194 ; AVX2-NEXT: # ymm4 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15]
5195 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
5196 ; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
5197 ; AVX2-NEXT: # ymm4 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15]
5198 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
5199 ; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
5200 ; AVX2-NEXT: # ymm5 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15]
5201 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
5202 ; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
5203 ; AVX2-NEXT: # ymm5 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15]
5204 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
5205 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5206 ; AVX2-NEXT: vmovaps %ymm5, (%rsi)
5207 ; AVX2-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload
5208 ; AVX2-NEXT: vmovaps %ymm5, (%rdx)
5209 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
5210 ; AVX2-NEXT: vmovdqa %ymm3, (%r8)
5211 ; AVX2-NEXT: vmovdqa %ymm4, (%r9)
5212 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
5213 ; AVX2-NEXT: vmovdqa %ymm1, (%rax)
5214 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
5215 ; AVX2-NEXT: vmovdqa %ymm2, (%rax)
5216 ; AVX2-NEXT: addq $72, %rsp
5217 ; AVX2-NEXT: vzeroupper
5218 ; AVX2-NEXT: retq
5219 ;
5220 ; AVX2-FP-LABEL: load_i8_stride7_vf32:
5221 ; AVX2-FP: # %bb.0:
5222 ; AVX2-FP-NEXT: subq $72, %rsp
5223 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm10
5224 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm11
5225 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm6
5226 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm7
5227 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm13
5228 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm3
5229 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
5230 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm0
5231 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
5232 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u]
5233 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
5234 ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0
5235 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
5236 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm1
5237 ; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm14
5238 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4
5239 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
5240 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
5241 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0]
5242 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
5243 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5244 ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm10, %ymm11, %ymm1
5245 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4
5246 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
5247 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
5248 ; AVX2-FP-NEXT: vpor %xmm4, %xmm1, %xmm1
5249 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5250 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm6, %ymm7, %ymm4
5251 ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5
5252 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u]
5253 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u,u,u,u,u]
5254 ; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4
5255 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm13, %ymm3, %ymm5
5256 ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm8
5257 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7,8,9],ymm8[10],ymm5[11,12,13],ymm8[14],ymm5[15]
5258 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
5259 ; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm8 = [0,18446744073709551360,16777215,0]
5260 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm8
5261 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm4
5262 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u]
5263 ; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm5
5264 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
5265 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3]
5266 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5267 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
5268 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7]
5269 ; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm9 = [18446744073709551615,18446744073709551615,16777215,0]
5270 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0
5271 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5272 ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm10, %ymm11, %ymm0
5273 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u]
5274 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0
5275 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u]
5276 ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0
5277 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
5278 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
5279 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
5280 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5281 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5282 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
5283 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm0
5284 ; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
5285 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm11, %ymm10, %ymm0
5286 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
5287 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u]
5288 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u]
5289 ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0
5290 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12]
5291 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
5292 ; AVX2-FP-NEXT: vpor %xmm1, %xmm8, %xmm1
5293 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8
5294 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5295 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
5296 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,0,1]
5297 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm8, %ymm1, %ymm1
5298 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5299 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
5300 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm1
5301 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u]
5302 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1
5303 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u]
5304 ; AVX2-FP-NEXT: vpor %xmm1, %xmm8, %xmm1
5305 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13]
5306 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
5307 ; AVX2-FP-NEXT: vpor %xmm8, %xmm12, %xmm8
5308 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5309 ; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
5310 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm8, %ymm1
5311 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5312 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
5313 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm11, %ymm10, %ymm1
5314 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u]
5315 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1
5316 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u]
5317 ; AVX2-FP-NEXT: vpor %xmm1, %xmm12, %xmm1
5318 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
5319 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14]
5320 ; AVX2-FP-NEXT: vpor %xmm12, %xmm14, %xmm12
5321 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5322 ; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
5323 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm12, %ymm1
5324 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5325 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm11, %ymm10, %ymm14
5326 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
5327 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
5328 ; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm11
5329 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u]
5330 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u]
5331 ; AVX2-FP-NEXT: vpor %xmm11, %xmm10, %xmm10
5332 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
5333 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15]
5334 ; AVX2-FP-NEXT: vpor %xmm11, %xmm15, %xmm11
5335 ; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
5336 ; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
5337 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm10, %ymm11, %ymm0
5338 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5339 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
5340 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm7, %ymm6, %ymm10
5341 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm11
5342 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm15
5343 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm7, %ymm6, %ymm1
5344 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5345 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6
5346 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5347 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm13, %ymm3, %ymm0
5348 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm13, %ymm3, %ymm9
5349 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm13, %ymm3, %ymm8
5350 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm3, %ymm13, %ymm7
5351 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm2
5352 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u]
5353 ; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm11
5354 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u]
5355 ; AVX2-FP-NEXT: vpor %xmm3, %xmm11, %xmm3
5356 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm11
5357 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7,8,9,10],ymm11[11],ymm0[12,13],ymm11[14],ymm0[15]
5358 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5359 ; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm11 = [18446744073709551615,255]
5360 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm3, %ymm0, %ymm0
5361 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u]
5362 ; AVX2-FP-NEXT: vextracti128 $1, %ymm15, %xmm13
5363 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u]
5364 ; AVX2-FP-NEXT: vpor %xmm3, %xmm13, %xmm3
5365 ; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm13
5366 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1,2],ymm13[3],ymm9[4,5,6],ymm13[7,8],ymm9[9,10],ymm13[11],ymm9[12,13,14],ymm13[15]
5367 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5368 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm3, %ymm9, %ymm3
5369 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm9
5370 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u]
5371 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u,u,u]
5372 ; AVX2-FP-NEXT: vpor %xmm1, %xmm9, %xmm1
5373 ; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9
5374 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3],ymm9[4],ymm8[5,6],ymm9[7,8],ymm8[9,10,11],ymm9[12],ymm8[13,14],ymm9[15]
5375 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5376 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm8, %ymm9
5377 ; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm1
5378 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[3,10],zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u]
5379 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u]
5380 ; AVX2-FP-NEXT: vpor %xmm1, %xmm6, %xmm1
5381 ; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm6
5382 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15]
5383 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5384 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm6, %ymm1
5385 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u]
5386 ; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm7
5387 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u]
5388 ; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6
5389 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm7
5390 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15]
5391 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5392 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm2
5393 ; AVX2-FP-NEXT: vextracti128 $1, %ymm14, %xmm6
5394 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u]
5395 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
5396 ; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6
5397 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u]
5398 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
5399 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
5400 ; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5
5401 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
5402 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
5403 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15]
5404 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
5405 ; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
5406 ; AVX2-FP-NEXT: # ymm4 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15]
5407 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
5408 ; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
5409 ; AVX2-FP-NEXT: # ymm4 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15]
5410 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
5411 ; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
5412 ; AVX2-FP-NEXT: # ymm5 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15]
5413 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
5414 ; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
5415 ; AVX2-FP-NEXT: # ymm5 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15]
5416 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
5417 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5418 ; AVX2-FP-NEXT: vmovaps %ymm5, (%rsi)
5419 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload
5420 ; AVX2-FP-NEXT: vmovaps %ymm5, (%rdx)
5421 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rcx)
5422 ; AVX2-FP-NEXT: vmovdqa %ymm3, (%r8)
5423 ; AVX2-FP-NEXT: vmovdqa %ymm4, (%r9)
5424 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5425 ; AVX2-FP-NEXT: vmovdqa %ymm1, (%rax)
5426 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5427 ; AVX2-FP-NEXT: vmovdqa %ymm2, (%rax)
5428 ; AVX2-FP-NEXT: addq $72, %rsp
5429 ; AVX2-FP-NEXT: vzeroupper
5430 ; AVX2-FP-NEXT: retq
5432 ; AVX2-FCP-LABEL: load_i8_stride7_vf32:
5433 ; AVX2-FCP: # %bb.0:
5434 ; AVX2-FCP-NEXT: subq $40, %rsp
5435 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm7
5436 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm8
5437 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm9
5438 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm5
5439 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm6
5440 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm3
5441 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4
5442 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
5443 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0
5444 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm13
5445 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
5446 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u]
5447 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
5448 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
5449 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
5450 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm4, %ymm3, %ymm1
5451 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
5452 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
5453 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
5454 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0]
5455 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5456 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5457 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm8, %ymm9, %ymm1
5458 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
5459 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u]
5460 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
5461 ; AVX2-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
5462 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5463 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6]
5464 ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm2
5465 ; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm10
5466 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5467 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
5468 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
5469 ; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm2 = [18446744073709551615,18446744073709551615,16777215,0]
5470 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5471 ; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
5472 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5473 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0
5474 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm12
5475 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
5476 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u,u,u]
5477 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u,u,u]
5478 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
5479 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm1
5480 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm7
5481 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3,4,5],ymm7[6],ymm1[7,8,9],ymm7[10],ymm1[11,12,13],ymm7[14],ymm1[15]
5482 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
5483 ; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm7 = [0,18446744073709551360,16777215,0]
5484 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
5485 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm1
5486 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u]
5487 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
5488 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u]
5489 ; AVX2-FCP-NEXT: vpor %xmm7, %xmm1, %xmm1
5490 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5491 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,4,6]
5492 ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm7, %ymm7
5493 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
5494 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7]
5495 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5496 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5497 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm9, %ymm8, %ymm0
5498 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
5499 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u]
5500 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u]
5501 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm1
5502 ; AVX2-FCP-NEXT: vmovdqa 208(%rdi), %xmm2
5503 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,12]
5504 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm0
5505 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
5506 ; AVX2-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7
5507 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm10
5508 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
5509 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
5510 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1]
5511 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm10, %ymm7, %ymm7
5512 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5513 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
5514 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm9, %ymm8, %ymm7
5515 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u]
5516 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
5517 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u]
5518 ; AVX2-FCP-NEXT: vpor %xmm7, %xmm10, %xmm7
5519 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13]
5520 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
5521 ; AVX2-FCP-NEXT: vpor %xmm10, %xmm14, %xmm10
5522 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
5523 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
5524 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm7, %ymm10, %ymm7
5525 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5526 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
5527 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm7
5528 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u]
5529 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
5530 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u]
5531 ; AVX2-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7
5532 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
5533 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14]
5534 ; AVX2-FCP-NEXT: vpor %xmm15, %xmm12, %xmm12
5535 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
5536 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
5537 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm7, %ymm12, %ymm15
5538 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
5539 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm8, %ymm12
5540 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
5541 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm8
5542 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
5543 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u]
5544 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u]
5545 ; AVX2-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
5546 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
5547 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15]
5548 ; AVX2-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
5549 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm2
5550 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5551 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm14
5552 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm6, %ymm5, %ymm11
5553 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm2
5554 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm5, %ymm6, %ymm8
5555 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm9
5556 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5557 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm6, %ymm5, %ymm5
5558 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
5559 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm0
5560 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm13
5561 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm10
5562 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm6
5563 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm4, %ymm3, %ymm3
5564 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u]
5565 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
5566 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u]
5567 ; AVX2-FCP-NEXT: vpor %xmm4, %xmm2, %xmm2
5568 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
5569 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
5570 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5571 ; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm7 = [18446744073709551615,255]
5572 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm2
5573 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
5574 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm4
5575 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u,u,u]
5576 ; AVX2-FCP-NEXT: vpor %xmm0, %xmm4, %xmm0
5577 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm4
5578 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm13[1,2],ymm4[3],ymm13[4,5,6],ymm4[7,8],ymm13[9,10],ymm4[11],ymm13[12,13,14],ymm4[15]
5579 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5580 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm4
5581 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm0
5582 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u]
5583 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u,u,u]
5584 ; AVX2-FCP-NEXT: vpor %xmm0, %xmm8, %xmm0
5585 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm8
5586 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm10[1,2,3],ymm8[4],ymm10[5,6],ymm8[7,8],ymm10[9,10,11],ymm8[12],ymm10[13,14],ymm8[15]
5587 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5588 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm8, %ymm0
5589 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm8
5590 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u]
5591 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[5,12],zero,zero,xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u]
5592 ; AVX2-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5
5593 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8
5594 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6,7,8],ymm8[9],ymm6[10,11],ymm8[12],ymm6[13,14,15]
5595 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5596 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
5597 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u]
5598 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm1
5599 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
5600 ; AVX2-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1
5601 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6
5602 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7,8],ymm6[9],ymm3[10,11,12],ymm6[13],ymm3[14,15]
5603 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
5604 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm1
5605 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm3
5606 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u]
5607 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
5608 ; AVX2-FCP-NEXT: vpor %xmm3, %xmm6, %xmm3
5609 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,1,3,5,6]
5610 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
5611 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5612 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
5613 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7]
5614 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15]
5615 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
5616 ; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload
5617 ; AVX2-FCP-NEXT: # ymm3 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15]
5618 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
5619 ; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
5620 ; AVX2-FCP-NEXT: # ymm4 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15]
5621 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
5622 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm15[1,2,3,4,5,6,7],ymm5[8],ymm15[9,10,11,12,13,14,15]
5623 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
5624 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0],ymm14[1,2,3,4,5,6,7],ymm1[8],ymm14[9,10,11,12,13,14,15]
5625 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
5626 ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload
5627 ; AVX2-FCP-NEXT: vmovaps %ymm5, (%rsi)
5628 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5629 ; AVX2-FCP-NEXT: vmovaps %ymm5, (%rdx)
5630 ; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rcx)
5631 ; AVX2-FCP-NEXT: vmovdqa %ymm3, (%r8)
5632 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%r9)
5633 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5634 ; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rax)
5635 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5636 ; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rax)
5637 ; AVX2-FCP-NEXT: addq $40, %rsp
5638 ; AVX2-FCP-NEXT: vzeroupper
5639 ; AVX2-FCP-NEXT: retq
5641 ; AVX512-LABEL: load_i8_stride7_vf32:
5642 ; AVX512: # %bb.0:
5643 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
5644 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
5645 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
5646 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm2
5647 ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm3
5648 ; AVX512-NEXT: vmovdqa %ymm0, %ymm1
5649 ; AVX512-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm1
5650 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm4
5651 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
5652 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
5653 ; AVX512-NEXT: vpor %xmm4, %xmm1, %xmm1
5654 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5655 ; AVX512-NEXT: vmovdqa 192(%rdi), %xmm4
5656 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
5657 ; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm6
5658 ; AVX512-NEXT: vmovdqa64 %xmm5, %xmm20
5659 ; AVX512-NEXT: vmovdqa 208(%rdi), %xmm5
5660 ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
5661 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
5662 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
5663 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5,6],ymm6[7]
5664 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
5665 ; AVX512-NEXT: vmovdqa (%rdi), %ymm6
5666 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm7
5667 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1
5668 ; AVX512-NEXT: vmovdqa %ymm14, %ymm9
5669 ; AVX512-NEXT: vpternlogq $202, %ymm7, %ymm6, %ymm9
5670 ; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm10
5671 ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u,u,u]
5672 ; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,7,14],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[u,u,u,u,u,u]
5673 ; AVX512-NEXT: vpor %xmm10, %xmm9, %xmm13
5674 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
5675 ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm9
5676 ; AVX512-NEXT: vmovdqa %ymm11, %ymm15
5677 ; AVX512-NEXT: vpternlogq $202, %ymm1, %ymm9, %ymm15
5678 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm10
5679 ; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15]
5680 ; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
5681 ; AVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm8
5682 ; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
5683 ; AVX512-NEXT: vpternlogq $226, %ymm12, %ymm16, %ymm8
5684 ; AVX512-NEXT: vmovdqa64 %ymm8, %ymm18
5685 ; AVX512-NEXT: vmovdqa %ymm11, %ymm12
5686 ; AVX512-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm12
5687 ; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,6,13],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[u,u,u,u]
5688 ; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm12
5689 ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[4,11],zero,zero,xmm12[0,7,14,u,u,u,u]
5690 ; AVX512-NEXT: vpor %xmm13, %xmm12, %xmm12
5691 ; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
5692 ; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
5693 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
5694 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3]
5695 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
5696 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7]
5697 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
5698 ; AVX512-NEXT: vmovdqa %ymm13, %ymm12
5699 ; AVX512-NEXT: vpternlogq $202, %ymm7, %ymm6, %ymm12
5700 ; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm15
5701 ; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
5702 ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[1,8,15],zero,zero,xmm12[4,11],zero,zero,xmm12[u,u,u,u,u,u,u]
5703 ; AVX512-NEXT: vpor %xmm15, %xmm12, %xmm15
5704 ; AVX512-NEXT: vmovdqa %ymm14, %ymm12
5705 ; AVX512-NEXT: vpternlogq $202, %ymm9, %ymm1, %ymm12
5706 ; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7,8,9],ymm10[10],ymm12[11,12,13],ymm10[14],ymm12[15]
5707 ; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
5708 ; AVX512-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm15, %ymm12
5709 ; AVX512-NEXT: vpternlogq $226, %ymm8, %ymm16, %ymm12
5710 ; AVX512-NEXT: vmovdqa64 %ymm12, %ymm19
5711 ; AVX512-NEXT: vmovdqa %ymm0, %ymm8
5712 ; AVX512-NEXT: vpternlogq $202, %ymm7, %ymm6, %ymm8
5713 ; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u]
5714 ; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm8
5715 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u]
5716 ; AVX512-NEXT: vpor %xmm15, %xmm8, %xmm8
5717 ; AVX512-NEXT: vmovdqa %ymm13, %ymm15
5718 ; AVX512-NEXT: vpternlogq $202, %ymm9, %ymm1, %ymm15
5719 ; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8,9,10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15]
5720 ; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5721 ; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
5722 ; AVX512-NEXT: vpternlogq $248, %ymm17, %ymm8, %ymm15
5723 ; AVX512-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm14
5724 ; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm8
5725 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u]
5726 ; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
5727 ; AVX512-NEXT: vpor %xmm8, %xmm14, %xmm8
5728 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
5729 ; AVX512-NEXT: vmovdqa64 %xmm20, %xmm12
5730 ; AVX512-NEXT: vpshufb %xmm12, %xmm5, %xmm14
5731 ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
5732 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3]
5733 ; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
5734 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7]
5735 ; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm8[1,2,3,4,5,6,7],ymm15[8],ymm8[9,10,11,12,13,14,15]
5736 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
5737 ; AVX512-NEXT: vmovdqa64 %ymm8, %ymm20
5738 ; AVX512-NEXT: vmovdqa %ymm13, %ymm8
5739 ; AVX512-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm8
5740 ; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm12
5741 ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
5742 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u]
5743 ; AVX512-NEXT: vpor %xmm12, %xmm8, %xmm8
5744 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
5745 ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12]
5746 ; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
5747 ; AVX512-NEXT: vpor %xmm12, %xmm14, %xmm12
5748 ; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
5749 ; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
5750 ; AVX512-NEXT: vpternlogq $184, %ymm8, %ymm16, %ymm12
5751 ; AVX512-NEXT: vmovdqa %ymm11, %ymm8
5752 ; AVX512-NEXT: vpternlogq $202, %ymm7, %ymm6, %ymm8
5753 ; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
5754 ; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm8
5755 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u]
5756 ; AVX512-NEXT: vpor %xmm14, %xmm8, %xmm8
5757 ; AVX512-NEXT: vmovdqa %ymm0, %ymm14
5758 ; AVX512-NEXT: vpternlogq $202, %ymm9, %ymm1, %ymm14
5759 ; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2],ymm10[3],ymm14[4,5,6],ymm10[7,8],ymm14[9,10],ymm10[11],ymm14[12,13,14],ymm10[15]
5760 ; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5761 ; AVX512-NEXT: vpternlogq $248, %ymm17, %ymm8, %ymm14
5762 ; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15]
5763 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
5764 ; AVX512-NEXT: vmovdqa64 %ymm8, %ymm21
5765 ; AVX512-NEXT: vmovdqa %ymm0, %ymm8
5766 ; AVX512-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm8
5767 ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u]
5768 ; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm8
5769 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u]
5770 ; AVX512-NEXT: vpor %xmm12, %xmm8, %xmm8
5771 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
5772 ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13]
5773 ; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
5774 ; AVX512-NEXT: vpor %xmm12, %xmm14, %xmm12
5775 ; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
5776 ; AVX512-NEXT: vpternlogq $184, %ymm8, %ymm16, %ymm12
5777 ; AVX512-NEXT: vmovdqa %ymm13, %ymm8
5778 ; AVX512-NEXT: vpternlogq $202, %ymm6, %ymm7, %ymm8
5779 ; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm14
5780 ; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u]
5781 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u]
5782 ; AVX512-NEXT: vpor %xmm14, %xmm8, %xmm8
5783 ; AVX512-NEXT: vmovdqa %ymm11, %ymm14
5784 ; AVX512-NEXT: vpternlogq $202, %ymm9, %ymm1, %ymm14
5785 ; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2,3],ymm10[4],ymm14[5,6],ymm10[7,8],ymm14[9,10,11],ymm10[12],ymm14[13,14],ymm10[15]
5786 ; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5787 ; AVX512-NEXT: vpternlogq $248, %ymm17, %ymm8, %ymm14
5788 ; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15]
5789 ; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm8[4,5,6,7]
5790 ; AVX512-NEXT: vmovdqa %ymm11, %ymm8
5791 ; AVX512-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm8
5792 ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u]
5793 ; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm8
5794 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u]
5795 ; AVX512-NEXT: vpor %xmm12, %xmm8, %xmm8
5796 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
5797 ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
5798 ; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14]
5799 ; AVX512-NEXT: vpor %xmm12, %xmm15, %xmm12
5800 ; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
5801 ; AVX512-NEXT: vpternlogq $184, %ymm8, %ymm16, %ymm12
5802 ; AVX512-NEXT: vmovdqa %ymm0, %ymm8
5803 ; AVX512-NEXT: vpternlogq $202, %ymm6, %ymm7, %ymm8
5804 ; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm15
5805 ; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
5806 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u]
5807 ; AVX512-NEXT: vpor %xmm15, %xmm8, %xmm8
5808 ; AVX512-NEXT: vmovdqa %ymm13, %ymm15
5809 ; AVX512-NEXT: vpternlogq $202, %ymm1, %ymm9, %ymm15
5810 ; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6,7,8],ymm10[9],ymm15[10,11],ymm10[12],ymm15[13,14,15]
5811 ; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5812 ; AVX512-NEXT: vpternlogq $248, %ymm17, %ymm8, %ymm15
5813 ; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm12[1,2,3,4,5,6,7],ymm15[8],ymm12[9,10,11,12,13,14,15]
5814 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
5815 ; AVX512-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm13
5816 ; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm2
5817 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
5818 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u]
5819 ; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2
5820 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
5821 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
5822 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15]
5823 ; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3
5824 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5825 ; AVX512-NEXT: vpternlogq $184, %ymm2, %ymm16, %ymm3
5826 ; AVX512-NEXT: vpternlogq $202, %ymm6, %ymm7, %ymm11
5827 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u]
5828 ; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm4
5829 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u]
5830 ; AVX512-NEXT: vpor %xmm2, %xmm4, %xmm2
5831 ; AVX512-NEXT: vpternlogq $202, %ymm1, %ymm9, %ymm0
5832 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7,8],ymm10[9],ymm0[10,11,12],ymm10[13],ymm0[14,15]
5833 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5834 ; AVX512-NEXT: vpternlogq $248, %ymm17, %ymm2, %ymm0
5835 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
5836 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5837 ; AVX512-NEXT: vmovdqa64 %ymm18, (%rsi)
5838 ; AVX512-NEXT: vmovdqa64 %ymm19, (%rdx)
5839 ; AVX512-NEXT: vmovdqa64 %ymm20, (%rcx)
5840 ; AVX512-NEXT: vmovdqa64 %ymm21, (%r8)
5841 ; AVX512-NEXT: vmovdqa %ymm14, (%r9)
5842 ; AVX512-NEXT: vmovdqa %ymm8, (%r10)
5843 ; AVX512-NEXT: vmovdqa %ymm0, (%rax)
5844 ; AVX512-NEXT: vzeroupper
5845 ; AVX512-NEXT: retq
5847 ; AVX512-FCP-LABEL: load_i8_stride7_vf32:
5848 ; AVX512-FCP: # %bb.0:
5849 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5850 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
5851 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
5852 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
5853 ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm3
5854 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1
5855 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm1
5856 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
5857 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
5858 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
5859 ; AVX512-FCP-NEXT: vpor %xmm4, %xmm1, %xmm1
5860 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5861 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6]
5862 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm12
5863 ; AVX512-FCP-NEXT: vpermd %ymm12, %ymm4, %ymm4
5864 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
5865 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5,6],ymm4[7]
5866 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
5867 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4
5868 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
5869 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm1
5870 ; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm7
5871 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm7
5872 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
5873 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u,u,u]
5874 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u]
5875 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm10
5876 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
5877 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm7
5878 ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm11
5879 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm1, %ymm7, %ymm11
5880 ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm8
5881 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15]
5882 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
5883 ; AVX512-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm11
5884 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
5885 ; AVX512-FCP-NEXT: vpternlogq $226, %ymm6, %ymm16, %ymm11
5886 ; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm18
5887 ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm6
5888 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm6
5889 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u]
5890 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
5891 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u]
5892 ; AVX512-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6
5893 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
5894 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,3,4,6]
5895 ; AVX512-FCP-NEXT: vpermd %ymm12, %ymm10, %ymm10
5896 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
5897 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
5898 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
5899 ; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm10
5900 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm10
5901 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14
5902 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u,u,u]
5903 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,8,15],zero,zero,xmm10[4,11],zero,zero,xmm10[u,u,u,u,u,u,u]
5904 ; AVX512-FCP-NEXT: vpor %xmm14, %xmm10, %xmm14
5905 ; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm10
5906 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm7, %ymm1, %ymm10
5907 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm8[2],ymm10[3,4,5],ymm8[6],ymm10[7,8,9],ymm8[10],ymm10[11,12,13],ymm8[14],ymm10[15]
5908 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
5909 ; AVX512-FCP-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm14, %ymm10
5910 ; AVX512-FCP-NEXT: vpternlogq $226, %ymm6, %ymm16, %ymm10
5911 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm6
5912 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm6
5913 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u]
5914 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
5915 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u]
5916 ; AVX512-FCP-NEXT: vpor %xmm6, %xmm14, %xmm6
5917 ; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm14
5918 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm7, %ymm1, %ymm14
5919 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7,8,9,10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15]
5920 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5921 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
5922 ; AVX512-FCP-NEXT: vpternlogq $248, %ymm17, %ymm6, %ymm14
5923 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm13
5924 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm6
5925 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u]
5926 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u]
5927 ; AVX512-FCP-NEXT: vpor %xmm6, %xmm13, %xmm6
5928 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
5929 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,5,6]
5930 ; AVX512-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12
5931 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
5932 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7]
5933 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1,2,3,4,5,6,7],ymm14[8],ymm6[9,10,11,12,13,14,15]
5934 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5,6,7]
5935 ; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm19
5936 ; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm6
5937 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm6
5938 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12
5939 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
5940 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u]
5941 ; AVX512-FCP-NEXT: vpor %xmm6, %xmm12, %xmm6
5942 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
5943 ; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm14
5944 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,12]
5945 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm12
5946 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
5947 ; AVX512-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13
5948 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
5949 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
5950 ; AVX512-FCP-NEXT: vpternlogq $184, %ymm6, %ymm16, %ymm13
5951 ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm6
5952 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm6
5953 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u,u,u]
5954 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
5955 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u,u,u]
5956 ; AVX512-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6
5957 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm15
5958 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm7, %ymm1, %ymm15
5959 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2],ymm8[3],ymm15[4,5,6],ymm8[7,8],ymm15[9,10],ymm8[11],ymm15[12,13,14],ymm8[15]
5960 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5961 ; AVX512-FCP-NEXT: vpternlogq $248, %ymm17, %ymm6, %ymm15
5962 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
5963 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
5964 ; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm20
5965 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm6
5966 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm6
5967 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u]
5968 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
5969 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u]
5970 ; AVX512-FCP-NEXT: vpor %xmm6, %xmm13, %xmm6
5971 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
5972 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13]
5973 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
5974 ; AVX512-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13
5975 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
5976 ; AVX512-FCP-NEXT: vpternlogq $184, %ymm6, %ymm16, %ymm13
5977 ; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm6
5978 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm6
5979 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15
5980 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
5981 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u]
5982 ; AVX512-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6
5983 ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm15
5984 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm7, %ymm1, %ymm15
5985 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2,3],ymm8[4],ymm15[5,6],ymm8[7,8],ymm15[9,10,11],ymm8[12],ymm15[13,14],ymm8[15]
5986 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
5987 ; AVX512-FCP-NEXT: vpternlogq $248, %ymm17, %ymm6, %ymm15
5988 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
5989 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
5990 ; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm21
5991 ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm6
5992 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm6
5993 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u]
5994 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
5995 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u]
5996 ; AVX512-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6
5997 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
5998 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
5999 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14]
6000 ; AVX512-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13
6001 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
6002 ; AVX512-FCP-NEXT: vpternlogq $184, %ymm6, %ymm16, %ymm13
6003 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm6
6004 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm6
6005 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15
6006 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
6007 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u]
6008 ; AVX512-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6
6009 ; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm15
6010 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm1, %ymm7, %ymm15
6011 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm8[1],ymm15[2,3],ymm8[4],ymm15[5,6,7,8],ymm8[9],ymm15[10,11],ymm8[12],ymm15[13,14,15]
6012 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6013 ; AVX512-FCP-NEXT: vpternlogq $248, %ymm17, %ymm6, %ymm15
6014 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
6015 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
6016 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm11
6017 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm2
6018 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
6019 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u]
6020 ; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
6021 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6022 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
6023 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15]
6024 ; AVX512-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3
6025 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
6026 ; AVX512-FCP-NEXT: vpternlogq $184, %ymm2, %ymm16, %ymm3
6027 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm9
6028 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u]
6029 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm4
6030 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u]
6031 ; AVX512-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2
6032 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm1, %ymm7, %ymm0
6033 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7,8],ymm8[9],ymm0[10,11,12],ymm8[13],ymm0[14,15]
6034 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6035 ; AVX512-FCP-NEXT: vpternlogq $248, %ymm17, %ymm2, %ymm0
6036 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
6037 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6038 ; AVX512-FCP-NEXT: vmovdqa64 %ymm18, (%rsi)
6039 ; AVX512-FCP-NEXT: vmovdqa %ymm10, (%rdx)
6040 ; AVX512-FCP-NEXT: vmovdqa64 %ymm19, (%rcx)
6041 ; AVX512-FCP-NEXT: vmovdqa64 %ymm20, (%r8)
6042 ; AVX512-FCP-NEXT: vmovdqa64 %ymm21, (%r9)
6043 ; AVX512-FCP-NEXT: vmovdqa %ymm6, (%r10)
6044 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax)
6045 ; AVX512-FCP-NEXT: vzeroupper
6046 ; AVX512-FCP-NEXT: retq
6047 ;
6048 ; AVX512DQ-LABEL: load_i8_stride7_vf32:
6049 ; AVX512DQ: # %bb.0:
6050 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
6051 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
6052 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
6053 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm2
6054 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm3
6055 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
6056 ; AVX512DQ-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm1
6057 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm4
6058 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
6059 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
6060 ; AVX512DQ-NEXT: vpor %xmm4, %xmm1, %xmm1
6061 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6062 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm4
6063 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
6064 ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm6
6065 ; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm20
6066 ; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm5
6067 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
6068 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
6069 ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
6070 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5,6],ymm6[7]
6071 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
6072 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm6
6073 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm7
6074 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm1
6075 ; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm9
6076 ; AVX512DQ-NEXT: vpternlogq $202, %ymm7, %ymm6, %ymm9
6077 ; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm10
6078 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u,u,u]
6079 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,7,14],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[u,u,u,u,u,u]
6080 ; AVX512DQ-NEXT: vpor %xmm10, %xmm9, %xmm13
6081 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
6082 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm9
6083 ; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm15
6084 ; AVX512DQ-NEXT: vpternlogq $202, %ymm1, %ymm9, %ymm15
6085 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm10
6086 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15]
6087 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
6088 ; AVX512DQ-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm13, %ymm8
6089 ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
6090 ; AVX512DQ-NEXT: vpternlogq $226, %ymm12, %ymm16, %ymm8
6091 ; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm18
6092 ; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm12
6093 ; AVX512DQ-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm12
6094 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,6,13],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[u,u,u,u]
6095 ; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm12
6096 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[4,11],zero,zero,xmm12[0,7,14,u,u,u,u]
6097 ; AVX512DQ-NEXT: vpor %xmm13, %xmm12, %xmm12
6098 ; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
6099 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
6100 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
6101 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3]
6102 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
6103 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7]
6104 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
6105 ; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm12
6106 ; AVX512DQ-NEXT: vpternlogq $202, %ymm7, %ymm6, %ymm12
6107 ; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm15
6108 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
6109 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[1,8,15],zero,zero,xmm12[4,11],zero,zero,xmm12[u,u,u,u,u,u,u]
6110 ; AVX512DQ-NEXT: vpor %xmm15, %xmm12, %xmm15
6111 ; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm12
6112 ; AVX512DQ-NEXT: vpternlogq $202, %ymm9, %ymm1, %ymm12
6113 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7,8,9],ymm10[10],ymm12[11,12,13],ymm10[14],ymm12[15]
6114 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
6115 ; AVX512DQ-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm15, %ymm12
6116 ; AVX512DQ-NEXT: vpternlogq $226, %ymm8, %ymm16, %ymm12
6117 ; AVX512DQ-NEXT: vmovdqa64 %ymm12, %ymm19
6118 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8
6119 ; AVX512DQ-NEXT: vpternlogq $202, %ymm7, %ymm6, %ymm8
6120 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u]
6121 ; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm8
6122 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u]
6123 ; AVX512DQ-NEXT: vpor %xmm15, %xmm8, %xmm8
6124 ; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm15
6125 ; AVX512DQ-NEXT: vpternlogq $202, %ymm9, %ymm1, %ymm15
6126 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8,9,10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15]
6127 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6128 ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
6129 ; AVX512DQ-NEXT: vpternlogq $248, %ymm17, %ymm8, %ymm15
6130 ; AVX512DQ-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm14
6131 ; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm8
6132 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u]
6133 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
6134 ; AVX512DQ-NEXT: vpor %xmm8, %xmm14, %xmm8
6135 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
6136 ; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm12
6137 ; AVX512DQ-NEXT: vpshufb %xmm12, %xmm5, %xmm14
6138 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
6139 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3]
6140 ; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
6141 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7]
6142 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm8[1,2,3,4,5,6,7],ymm15[8],ymm8[9,10,11,12,13,14,15]
6143 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
6144 ; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm20
6145 ; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm8
6146 ; AVX512DQ-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm8
6147 ; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm12
6148 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
6149 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u]
6150 ; AVX512DQ-NEXT: vpor %xmm12, %xmm8, %xmm8
6151 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
6152 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12]
6153 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
6154 ; AVX512DQ-NEXT: vpor %xmm12, %xmm14, %xmm12
6155 ; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
6156 ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
6157 ; AVX512DQ-NEXT: vpternlogq $184, %ymm8, %ymm16, %ymm12
6158 ; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm8
6159 ; AVX512DQ-NEXT: vpternlogq $202, %ymm7, %ymm6, %ymm8
6160 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
6161 ; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm8
6162 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u]
6163 ; AVX512DQ-NEXT: vpor %xmm14, %xmm8, %xmm8
6164 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm14
6165 ; AVX512DQ-NEXT: vpternlogq $202, %ymm9, %ymm1, %ymm14
6166 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2],ymm10[3],ymm14[4,5,6],ymm10[7,8],ymm14[9,10],ymm10[11],ymm14[12,13,14],ymm10[15]
6167 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6168 ; AVX512DQ-NEXT: vpternlogq $248, %ymm17, %ymm8, %ymm14
6169 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15]
6170 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
6171 ; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm21
6172 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8
6173 ; AVX512DQ-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm8
6174 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u]
6175 ; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm8
6176 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u]
6177 ; AVX512DQ-NEXT: vpor %xmm12, %xmm8, %xmm8
6178 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
6179 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13]
6180 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
6181 ; AVX512DQ-NEXT: vpor %xmm12, %xmm14, %xmm12
6182 ; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
6183 ; AVX512DQ-NEXT: vpternlogq $184, %ymm8, %ymm16, %ymm12
6184 ; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm8
6185 ; AVX512DQ-NEXT: vpternlogq $202, %ymm6, %ymm7, %ymm8
6186 ; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm14
6187 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u]
6188 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u]
6189 ; AVX512DQ-NEXT: vpor %xmm14, %xmm8, %xmm8
6190 ; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm14
6191 ; AVX512DQ-NEXT: vpternlogq $202, %ymm9, %ymm1, %ymm14
6192 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2,3],ymm10[4],ymm14[5,6],ymm10[7,8],ymm14[9,10,11],ymm10[12],ymm14[13,14],ymm10[15]
6193 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6194 ; AVX512DQ-NEXT: vpternlogq $248, %ymm17, %ymm8, %ymm14
6195 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15]
6196 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm8[4,5,6,7]
6197 ; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm8
6198 ; AVX512DQ-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm8
6199 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u]
6200 ; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm8
6201 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u]
6202 ; AVX512DQ-NEXT: vpor %xmm12, %xmm8, %xmm8
6203 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
6204 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
6205 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14]
6206 ; AVX512DQ-NEXT: vpor %xmm12, %xmm15, %xmm12
6207 ; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
6208 ; AVX512DQ-NEXT: vpternlogq $184, %ymm8, %ymm16, %ymm12
6209 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8
6210 ; AVX512DQ-NEXT: vpternlogq $202, %ymm6, %ymm7, %ymm8
6211 ; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm15
6212 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
6213 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u]
6214 ; AVX512DQ-NEXT: vpor %xmm15, %xmm8, %xmm8
6215 ; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm15
6216 ; AVX512DQ-NEXT: vpternlogq $202, %ymm1, %ymm9, %ymm15
6217 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6,7,8],ymm10[9],ymm15[10,11],ymm10[12],ymm15[13,14,15]
6218 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6219 ; AVX512DQ-NEXT: vpternlogq $248, %ymm17, %ymm8, %ymm15
6220 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm12[1,2,3,4,5,6,7],ymm15[8],ymm12[9,10,11,12,13,14,15]
6221 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
6222 ; AVX512DQ-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm13
6223 ; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm2
6224 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
6225 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u]
6226 ; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2
6227 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6228 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
6229 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15]
6230 ; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3
6231 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
6232 ; AVX512DQ-NEXT: vpternlogq $184, %ymm2, %ymm16, %ymm3
6233 ; AVX512DQ-NEXT: vpternlogq $202, %ymm6, %ymm7, %ymm11
6234 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u]
6235 ; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm4
6236 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u]
6237 ; AVX512DQ-NEXT: vpor %xmm2, %xmm4, %xmm2
6238 ; AVX512DQ-NEXT: vpternlogq $202, %ymm1, %ymm9, %ymm0
6239 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7,8],ymm10[9],ymm0[10,11,12],ymm10[13],ymm0[14,15]
6240 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6241 ; AVX512DQ-NEXT: vpternlogq $248, %ymm17, %ymm2, %ymm0
6242 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
6243 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6244 ; AVX512DQ-NEXT: vmovdqa64 %ymm18, (%rsi)
6245 ; AVX512DQ-NEXT: vmovdqa64 %ymm19, (%rdx)
6246 ; AVX512DQ-NEXT: vmovdqa64 %ymm20, (%rcx)
6247 ; AVX512DQ-NEXT: vmovdqa64 %ymm21, (%r8)
6248 ; AVX512DQ-NEXT: vmovdqa %ymm14, (%r9)
6249 ; AVX512DQ-NEXT: vmovdqa %ymm8, (%r10)
6250 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax)
6251 ; AVX512DQ-NEXT: vzeroupper
6252 ; AVX512DQ-NEXT: retq
6253 ;
6254 ; AVX512DQ-FCP-LABEL: load_i8_stride7_vf32:
6255 ; AVX512DQ-FCP: # %bb.0:
6256 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6257 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
6258 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
6259 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
6260 ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm3
6261 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1
6262 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm1
6263 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
6264 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
6265 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
6266 ; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm1, %xmm1
6267 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6268 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6]
6269 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm12
6270 ; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm4, %ymm4
6271 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
6272 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5,6],ymm4[7]
6273 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
6274 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4
6275 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
6276 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm1
6277 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm7
6278 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm7
6279 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
6280 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u,u,u]
6281 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u]
6282 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm10
6283 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
6284 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm7
6285 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm11
6286 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm1, %ymm7, %ymm11
6287 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm8
6288 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15]
6289 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
6290 ; AVX512DQ-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm11
6291 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
6292 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm6, %ymm16, %ymm11
6293 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm18
6294 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm6
6295 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm6
6296 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u]
6297 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
6298 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u]
6299 ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6
6300 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
6301 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,3,4,6]
6302 ; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm10, %ymm10
6303 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
6304 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
6305 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
6306 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm10
6307 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm10
6308 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14
6309 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u,u,u]
6310 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,8,15],zero,zero,xmm10[4,11],zero,zero,xmm10[u,u,u,u,u,u,u]
6311 ; AVX512DQ-FCP-NEXT: vpor %xmm14, %xmm10, %xmm14
6312 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm10
6313 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm7, %ymm1, %ymm10
6314 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm8[2],ymm10[3,4,5],ymm8[6],ymm10[7,8,9],ymm8[10],ymm10[11,12,13],ymm8[14],ymm10[15]
6315 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
6316 ; AVX512DQ-FCP-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm14, %ymm10
6317 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm6, %ymm16, %ymm10
6318 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm6
6319 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm6
6320 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u]
6321 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
6322 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u]
6323 ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm14, %xmm6
6324 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm14
6325 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm7, %ymm1, %ymm14
6326 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7,8,9,10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15]
6327 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6328 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
6329 ; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm17, %ymm6, %ymm14
6330 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm13
6331 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm6
6332 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u]
6333 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u]
6334 ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm13, %xmm6
6335 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
6336 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,5,6]
6337 ; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12
6338 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
6339 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7]
6340 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1,2,3,4,5,6,7],ymm14[8],ymm6[9,10,11,12,13,14,15]
6341 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5,6,7]
6342 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm19
6343 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm6
6344 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm6
6345 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12
6346 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
6347 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u]
6348 ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm12, %xmm6
6349 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
6350 ; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm14
6351 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,12]
6352 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm12
6353 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
6354 ; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13
6355 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
6356 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
6357 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm6, %ymm16, %ymm13
6358 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm6
6359 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm6
6360 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u,u,u]
6361 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
6362 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u,u,u]
6363 ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6
6364 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm15
6365 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm7, %ymm1, %ymm15
6366 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2],ymm8[3],ymm15[4,5,6],ymm8[7,8],ymm15[9,10],ymm8[11],ymm15[12,13,14],ymm8[15]
6367 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6368 ; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm17, %ymm6, %ymm15
6369 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
6370 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
6371 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm20
6372 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm6
6373 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm6
6374 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u]
6375 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
6376 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u]
6377 ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm13, %xmm6
6378 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
6379 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13]
6380 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
6381 ; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13
6382 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
6383 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm6, %ymm16, %ymm13
6384 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm6
6385 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm6
6386 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15
6387 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
6388 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u]
6389 ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6
6390 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm15
6391 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm7, %ymm1, %ymm15
6392 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2,3],ymm8[4],ymm15[5,6],ymm8[7,8],ymm15[9,10,11],ymm8[12],ymm15[13,14],ymm8[15]
6393 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6394 ; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm17, %ymm6, %ymm15
6395 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
6396 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
6397 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm21
6398 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm6
6399 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm6
6400 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u]
6401 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
6402 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u]
6403 ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6
6404 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
6405 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
6406 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14]
6407 ; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13
6408 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
6409 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm6, %ymm16, %ymm13
6410 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm6
6411 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm6
6412 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15
6413 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
6414 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u]
6415 ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6
6416 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm15
6417 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm1, %ymm7, %ymm15
6418 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm8[1],ymm15[2,3],ymm8[4],ymm15[5,6,7,8],ymm8[9],ymm15[10,11],ymm8[12],ymm15[13,14,15]
6419 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6420 ; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm17, %ymm6, %ymm15
6421 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
6422 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
6423 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm2, %ymm3, %ymm11
6424 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm2
6425 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
6426 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u]
6427 ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
6428 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6429 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
6430 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15]
6431 ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3
6432 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
6433 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm2, %ymm16, %ymm3
6434 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm9
6435 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u]
6436 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm4
6437 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u]
6438 ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2
6439 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm1, %ymm7, %ymm0
6440 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7,8],ymm8[9],ymm0[10,11,12],ymm8[13],ymm0[14,15]
6441 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6442 ; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm17, %ymm2, %ymm0
6443 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
6444 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6445 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, (%rsi)
6446 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, (%rdx)
6447 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, (%rcx)
6448 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, (%r8)
6449 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, (%r9)
6450 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%r10)
6451 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax)
6452 ; AVX512DQ-FCP-NEXT: vzeroupper
6453 ; AVX512DQ-FCP-NEXT: retq
6454 ;
6455 ; AVX512BW-LABEL: load_i8_stride7_vf32:
6456 ; AVX512BW: # %bb.0:
6457 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
6458 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
6459 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31]
6460 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
6461 ; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
6462 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31]
6463 ; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm4
6464 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0]
6465 ; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm5
6466 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0]
6467 ; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm11
6468 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15]
6469 ; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm12
6470 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31]
6471 ; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm10
6472 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31]
6473 ; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm6
6474 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm3
6475 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
6476 ; AVX512BW-NEXT: movw $-28382, %r11w # imm = 0x9122
6477 ; AVX512BW-NEXT: kmovd %r11d, %k5
6478 ; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm1 {%k5}
6479 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm7
6480 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u]
6481 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
6482 ; AVX512BW-NEXT: vpor %xmm7, %xmm1, %xmm1
6483 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
6484 ; AVX512BW-NEXT: movw $992, %r11w # imm = 0x3E0
6485 ; AVX512BW-NEXT: kmovd %r11d, %k1
6486 ; AVX512BW-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1}
6487 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm7
6488 ; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm6
6489 ; AVX512BW-NEXT: movw $8772, %r11w # imm = 0x2244
6490 ; AVX512BW-NEXT: kmovd %r11d, %k1
6491 ; AVX512BW-NEXT: vpblendmw %ymm7, %ymm6, %ymm8 {%k1}
6492 ; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm9
6493 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u]
6494 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u]
6495 ; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8
6496 ; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm13
6497 ; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm8
6498 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
6499 ; AVX512BW-NEXT: vpshufb %xmm14, %xmm8, %xmm15
6500 ; AVX512BW-NEXT: vmovdqa 208(%rdi), %xmm9
6501 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
6502 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3]
6503 ; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
6504 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
6505 ; AVX512BW-NEXT: movl $-524288, %edi # imm = 0xFFF80000
6506 ; AVX512BW-NEXT: kmovd %edi, %k4
6507 ; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm1 {%k4}
6508 ; AVX512BW-NEXT: movw $4644, %di # imm = 0x1224
6509 ; AVX512BW-NEXT: kmovd %edi, %k2
6510 ; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2}
6511 ; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm15
6512 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
6513 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u]
6514 ; AVX512BW-NEXT: vpor %xmm15, %xmm13, %xmm13
6515 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
6516 ; AVX512BW-NEXT: movl $511, %edi # imm = 0x1FF
6517 ; AVX512BW-NEXT: kmovd %edi, %k3
6518 ; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k3}
6519 ; AVX512BW-NEXT: movw $9288, %di # imm = 0x2448
6520 ; AVX512BW-NEXT: kmovd %edi, %k3
6521 ; AVX512BW-NEXT: vpblendmw %ymm7, %ymm6, %ymm13 {%k3}
6522 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u]
6523 ; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13
6524 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,xmm13[4,11],zero,zero,xmm13[0,7,14,u,u,u,u]
6525 ; AVX512BW-NEXT: vpor %xmm15, %xmm13, %xmm13
6526 ; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
6527 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
6528 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
6529 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3]
6530 ; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
6531 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
6532 ; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k4}
6533 ; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k1}
6534 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u]
6535 ; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13
6536 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u]
6537 ; AVX512BW-NEXT: vpor %xmm15, %xmm13, %xmm13
6538 ; AVX512BW-NEXT: movl $261632, %edi # imm = 0x3FE00
6539 ; AVX512BW-NEXT: kmovd %edi, %k4
6540 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm13 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6541 ; AVX512BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm12 {%k5}
6542 ; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm15
6543 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u]
6544 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
6545 ; AVX512BW-NEXT: vpor %xmm15, %xmm12, %xmm12
6546 ; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
6547 ; AVX512BW-NEXT: vpshufb %xmm14, %xmm9, %xmm14
6548 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
6549 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
6550 ; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
6551 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7]
6552 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15]
6553 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
6554 ; AVX512BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k2}
6555 ; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm14
6556 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u]
6557 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u]
6558 ; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm13
6559 ; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
6560 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12]
6561 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
6562 ; AVX512BW-NEXT: vpor %xmm14, %xmm15, %xmm14
6563 ; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
6564 ; AVX512BW-NEXT: movl $-134217728, %edi # imm = 0xF8000000
6565 ; AVX512BW-NEXT: kmovd %edi, %k5
6566 ; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
6567 ; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k3}
6568 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
6569 ; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm14
6570 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
6571 ; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14
6572 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6573 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
6574 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7]
6575 ; AVX512BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k1}
6576 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u]
6577 ; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13
6578 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u]
6579 ; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm13
6580 ; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
6581 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13]
6582 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
6583 ; AVX512BW-NEXT: vpor %xmm14, %xmm15, %xmm14
6584 ; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
6585 ; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
6586 ; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k2}
6587 ; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm15
6588 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
6589 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u]
6590 ; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14
6591 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6592 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
6593 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
6594 ; AVX512BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k3}
6595 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
6596 ; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13
6597 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u]
6598 ; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm13
6599 ; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
6600 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
6601 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14]
6602 ; AVX512BW-NEXT: vpor %xmm14, %xmm15, %xmm14
6603 ; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
6604 ; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
6605 ; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k1}
6606 ; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm15
6607 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
6608 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u]
6609 ; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14
6610 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6611 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
6612 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7]
6613 ; AVX512BW-NEXT: vmovdqu16 %ymm7, %ymm6 {%k2}
6614 ; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7
6615 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u]
6616 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
6617 ; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6
6618 ; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
6619 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
6620 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15]
6621 ; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7
6622 ; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
6623 ; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k5}
6624 ; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k3}
6625 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
6626 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
6627 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
6628 ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
6629 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6630 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15]
6631 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6632 ; AVX512BW-NEXT: vmovdqa %ymm1, (%rsi)
6633 ; AVX512BW-NEXT: vmovdqa %ymm10, (%rdx)
6634 ; AVX512BW-NEXT: vmovdqa %ymm12, (%rcx)
6635 ; AVX512BW-NEXT: vmovdqa %ymm11, (%r8)
6636 ; AVX512BW-NEXT: vmovdqa %ymm5, (%r9)
6637 ; AVX512BW-NEXT: vmovdqa %ymm4, (%r10)
6638 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rax)
6639 ; AVX512BW-NEXT: vzeroupper
6640 ; AVX512BW-NEXT: retq
6641 ;
6642 ; AVX512BW-FCP-LABEL: load_i8_stride7_vf32:
6643 ; AVX512BW-FCP: # %bb.0:
6644 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6645 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
6646 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
6647 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
6648 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
6649 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
6650 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm4
6651 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
6652 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm5
6653 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7]
6654 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm9
6655 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15]
6656 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm10
6657 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
6658 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm8
6659 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
6660 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm6
6661 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm3
6662 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
6663 ; AVX512BW-FCP-NEXT: movw $-28382, %r11w # imm = 0x9122
6664 ; AVX512BW-FCP-NEXT: kmovd %r11d, %k5
6665 ; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm1 {%k5}
6666 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm7
6667 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u]
6668 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
6669 ; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm1, %xmm1
6670 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
6671 ; AVX512BW-FCP-NEXT: movw $992, %r11w # imm = 0x3E0
6672 ; AVX512BW-FCP-NEXT: kmovd %r11d, %k1
6673 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1}
6674 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm7
6675 ; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm6
6676 ; AVX512BW-FCP-NEXT: movw $8772, %r11w # imm = 0x2244
6677 ; AVX512BW-FCP-NEXT: kmovd %r11d, %k1
6678 ; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k1}
6679 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
6680 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u]
6681 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u]
6682 ; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
6683 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
6684 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,2,4,6]
6685 ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm13
6686 ; AVX512BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12
6687 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
6688 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
6689 ; AVX512BW-FCP-NEXT: movl $-524288, %r11d # imm = 0xFFF80000
6690 ; AVX512BW-FCP-NEXT: kmovd %r11d, %k4
6691 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm1 {%k4}
6692 ; AVX512BW-FCP-NEXT: movw $4644, %r11w # imm = 0x1224
6693 ; AVX512BW-FCP-NEXT: kmovd %r11d, %k2
6694 ; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k2}
6695 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
6696 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u,u,u]
6697 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u,u,u]
6698 ; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
6699 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
6700 ; AVX512BW-FCP-NEXT: movl $511, %r11d # imm = 0x1FF
6701 ; AVX512BW-FCP-NEXT: kmovd %r11d, %k3
6702 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm8 {%k3}
6703 ; AVX512BW-FCP-NEXT: movw $9288, %r11w # imm = 0x2448
6704 ; AVX512BW-FCP-NEXT: kmovd %r11d, %k3
6705 ; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k3}
6706 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u]
6707 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
6708 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u]
6709 ; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
6710 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
6711 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,4,6]
6712 ; AVX512BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12
6713 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
6714 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
6715 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm8 {%k4}
6716 ; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k1}
6717 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u]
6718 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
6719 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u]
6720 ; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
6721 ; AVX512BW-FCP-NEXT: movl $261632, %r11d # imm = 0x3FE00
6722 ; AVX512BW-FCP-NEXT: kmovd %r11d, %k4
6723 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 {%k4} = ymm10[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6724 ; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm10 {%k5}
6725 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12
6726 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[1,8,15,u,u,u,u]
6727 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u]
6728 ; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10
6729 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
6730 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,5,6]
6731 ; AVX512BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12
6732 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
6733 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
6734 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15]
6735 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
6736 ; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm11 {%k2}
6737 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
6738 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
6739 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u]
6740 ; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
6741 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm13
6742 ; AVX512BW-FCP-NEXT: vmovdqa 208(%rdi), %xmm11
6743 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[5,12]
6744 ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm12
6745 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
6746 ; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14
6747 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
6748 ; AVX512BW-FCP-NEXT: movl $-134217728, %edi # imm = 0xF8000000
6749 ; AVX512BW-FCP-NEXT: kmovd %edi, %k5
6750 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
6751 ; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k3}
6752 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
6753 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
6754 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
6755 ; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
6756 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6757 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
6758 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
6759 ; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k1}
6760 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u]
6761 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13
6762 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u]
6763 ; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13
6764 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
6765 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13]
6766 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
6767 ; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14
6768 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
6769 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
6770 ; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k2}
6771 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
6772 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
6773 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u]
6774 ; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
6775 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6776 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
6777 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
6778 ; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k3}
6779 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
6780 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13
6781 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u]
6782 ; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13
6783 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
6784 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
6785 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[0,7,14]
6786 ; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14
6787 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
6788 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
6789 ; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k1}
6790 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
6791 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
6792 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u]
6793 ; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
6794 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6795 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
6796 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7]
6797 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm6 {%k2}
6798 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
6799 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u]
6800 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
6801 ; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
6802 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
6803 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
6804 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[1,8,15]
6805 ; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm11, %xmm7
6806 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
6807 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm6 {%k5}
6808 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k3}
6809 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
6810 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
6811 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
6812 ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
6813 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6814 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15]
6815 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6816 ; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rsi)
6817 ; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%rdx)
6818 ; AVX512BW-FCP-NEXT: vmovdqa %ymm10, (%rcx)
6819 ; AVX512BW-FCP-NEXT: vmovdqa %ymm9, (%r8)
6820 ; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%r9)
6821 ; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%r10)
6822 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
6823 ; AVX512BW-FCP-NEXT: vzeroupper
6824 ; AVX512BW-FCP-NEXT: retq
6826 ; AVX512DQ-BW-LABEL: load_i8_stride7_vf32:
6827 ; AVX512DQ-BW: # %bb.0:
6828 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
6829 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
6830 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31]
6831 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
6832 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
6833 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31]
6834 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm4
6835 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0]
6836 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm5
6837 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0]
6838 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm11
6839 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15]
6840 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm12
6841 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31]
6842 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm10
6843 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31]
6844 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm6
6845 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm3
6846 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2
6847 ; AVX512DQ-BW-NEXT: movw $-28382, %r11w # imm = 0x9122
6848 ; AVX512DQ-BW-NEXT: kmovd %r11d, %k5
6849 ; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm1 {%k5}
6850 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm7
6851 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u]
6852 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
6853 ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm1, %xmm1
6854 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
6855 ; AVX512DQ-BW-NEXT: movw $992, %r11w # imm = 0x3E0
6856 ; AVX512DQ-BW-NEXT: kmovd %r11d, %k1
6857 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1}
6858 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm7
6859 ; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm6
6860 ; AVX512DQ-BW-NEXT: movw $8772, %r11w # imm = 0x2244
6861 ; AVX512DQ-BW-NEXT: kmovd %r11d, %k1
6862 ; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm6, %ymm8 {%k1}
6863 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm8, %xmm9
6864 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u]
6865 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u]
6866 ; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm8, %xmm8
6867 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm13
6868 ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm8
6869 ; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
6870 ; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm8, %xmm15
6871 ; AVX512DQ-BW-NEXT: vmovdqa 208(%rdi), %xmm9
6872 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
6873 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3]
6874 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
6875 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
6876 ; AVX512DQ-BW-NEXT: movl $-524288, %edi # imm = 0xFFF80000
6877 ; AVX512DQ-BW-NEXT: kmovd %edi, %k4
6878 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm1 {%k4}
6879 ; AVX512DQ-BW-NEXT: movw $4644, %di # imm = 0x1224
6880 ; AVX512DQ-BW-NEXT: kmovd %edi, %k2
6881 ; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2}
6882 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm15
6883 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
6884 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u]
6885 ; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm13, %xmm13
6886 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
6887 ; AVX512DQ-BW-NEXT: movl $511, %edi # imm = 0x1FF
6888 ; AVX512DQ-BW-NEXT: kmovd %edi, %k3
6889 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k3}
6890 ; AVX512DQ-BW-NEXT: movw $9288, %di # imm = 0x2448
6891 ; AVX512DQ-BW-NEXT: kmovd %edi, %k3
6892 ; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm6, %ymm13 {%k3}
6893 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u]
6894 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13
6895 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,xmm13[4,11],zero,zero,xmm13[0,7,14,u,u,u,u]
6896 ; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm13, %xmm13
6897 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
6898 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
6899 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
6900 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3]
6901 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
6902 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
6903 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k4}
6904 ; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k1}
6905 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u]
6906 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13
6907 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u]
6908 ; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm13, %xmm13
6909 ; AVX512DQ-BW-NEXT: movl $261632, %edi # imm = 0x3FE00
6910 ; AVX512DQ-BW-NEXT: kmovd %edi, %k4
6911 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm13 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6912 ; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm12 {%k5}
6913 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm15
6914 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u]
6915 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
6916 ; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm12, %xmm12
6917 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
6918 ; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm9, %xmm14
6919 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
6920 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
6921 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
6922 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7]
6923 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15]
6924 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
6925 ; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k2}
6926 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm14
6927 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u]
6928 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u]
6929 ; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm13
6930 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
6931 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12]
6932 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
6933 ; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm15, %xmm14
6934 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
6935 ; AVX512DQ-BW-NEXT: movl $-134217728, %edi # imm = 0xF8000000
6936 ; AVX512DQ-BW-NEXT: kmovd %edi, %k5
6937 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
6938 ; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k3}
6939 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
6940 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm14
6941 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
6942 ; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm14, %xmm14
6943 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6944 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
6945 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7]
6946 ; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k1}
6947 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u]
6948 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13
6949 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u]
6950 ; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm13
6951 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
6952 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13]
6953 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
6954 ; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm15, %xmm14
6955 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
6956 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
6957 ; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k2}
6958 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm15
6959 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
6960 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u]
6961 ; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm14, %xmm14
6962 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6963 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
6964 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
6965 ; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k3}
6966 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
6967 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13
6968 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u]
6969 ; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm13
6970 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
6971 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
6972 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14]
6973 ; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm15, %xmm14
6974 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
6975 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
6976 ; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k1}
6977 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm15
6978 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
6979 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u]
6980 ; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm14, %xmm14
6981 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6982 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
6983 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7]
6984 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm7, %ymm6 {%k2}
6985 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7
6986 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u]
6987 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
6988 ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6
6989 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
6990 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
6991 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15]
6992 ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7
6993 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
6994 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k5}
6995 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k3}
6996 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
6997 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2
6998 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
6999 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
7000 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
7001 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15]
7002 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
7003 ; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%rsi)
7004 ; AVX512DQ-BW-NEXT: vmovdqa %ymm10, (%rdx)
7005 ; AVX512DQ-BW-NEXT: vmovdqa %ymm12, (%rcx)
7006 ; AVX512DQ-BW-NEXT: vmovdqa %ymm11, (%r8)
7007 ; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%r9)
7008 ; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%r10)
7009 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax)
7010 ; AVX512DQ-BW-NEXT: vzeroupper
7011 ; AVX512DQ-BW-NEXT: retq
7013 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf32:
7014 ; AVX512DQ-BW-FCP: # %bb.0:
7015 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7016 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
7017 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
7018 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
7019 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
7020 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
7021 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm4
7022 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
7023 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm5
7024 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7]
7025 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm9
7026 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15]
7027 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm10
7028 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
7029 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm8
7030 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
7031 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm6
7032 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm3
7033 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
7034 ; AVX512DQ-BW-FCP-NEXT: movw $-28382, %r11w # imm = 0x9122
7035 ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k5
7036 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm1 {%k5}
7037 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm7
7038 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u]
7039 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
7040 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm1, %xmm1
7041 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
7042 ; AVX512DQ-BW-FCP-NEXT: movw $992, %r11w # imm = 0x3E0
7043 ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1
7044 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1}
7045 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm7
7046 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm6
7047 ; AVX512DQ-BW-FCP-NEXT: movw $8772, %r11w # imm = 0x2244
7048 ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1
7049 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k1}
7050 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
7051 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u]
7052 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u]
7053 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
7054 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
7055 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,2,4,6]
7056 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm13
7057 ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12
7058 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
7059 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
7060 ; AVX512DQ-BW-FCP-NEXT: movl $-524288, %r11d # imm = 0xFFF80000
7061 ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k4
7062 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm1 {%k4}
7063 ; AVX512DQ-BW-FCP-NEXT: movw $4644, %r11w # imm = 0x1224
7064 ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k2
7065 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k2}
7066 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
7067 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u,u,u]
7068 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u,u,u]
7069 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
7070 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
7071 ; AVX512DQ-BW-FCP-NEXT: movl $511, %r11d # imm = 0x1FF
7072 ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k3
7073 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm8 {%k3}
7074 ; AVX512DQ-BW-FCP-NEXT: movw $9288, %r11w # imm = 0x2448
7075 ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k3
7076 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k3}
7077 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u]
7078 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
7079 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u]
7080 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
7081 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
7082 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,4,6]
7083 ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12
7084 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
7085 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
7086 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm8 {%k4}
7087 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k1}
7088 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u]
7089 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
7090 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u]
7091 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
7092 ; AVX512DQ-BW-FCP-NEXT: movl $261632, %r11d # imm = 0x3FE00
7093 ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k4
7094 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 {%k4} = ymm10[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
7095 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm10 {%k5}
7096 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12
7097 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[1,8,15,u,u,u,u]
7098 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u]
7099 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10
7100 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
7101 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,5,6]
7102 ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12
7103 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
7104 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
7105 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15]
7106 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
7107 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm11 {%k2}
7108 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
7109 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
7110 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u]
7111 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
7112 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm13
7113 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 208(%rdi), %xmm11
7114 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[5,12]
7115 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm12
7116 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
7117 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14
7118 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
7119 ; AVX512DQ-BW-FCP-NEXT: movl $-134217728, %edi # imm = 0xF8000000
7120 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k5
7121 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
7122 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k3}
7123 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
7124 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
7125 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
7126 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
7127 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
7128 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
7129 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
7130 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k1}
7131 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u]
7132 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13
7133 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u]
7134 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13
7135 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
7136 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13]
7137 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
7138 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14
7139 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
7140 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
7141 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k2}
7142 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
7143 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
7144 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u]
7145 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
7146 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
7147 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
7148 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
7149 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k3}
7150 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
7151 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13
7152 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u]
7153 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13
7154 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
7155 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
7156 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[0,7,14]
7157 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14
7158 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
7159 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
7160 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k1}
7161 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
7162 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
7163 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u]
7164 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
7165 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
7166 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
7167 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7]
7168 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm6 {%k2}
7169 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
7170 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u]
7171 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
7172 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
7173 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
7174 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
7175 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[1,8,15]
7176 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm11, %xmm7
7177 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
7178 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm6 {%k5}
7179 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k3}
7180 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
7181 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
7182 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
7183 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
7184 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
7185 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15]
7186 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
7187 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rsi)
7188 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%rdx)
7189 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm10, (%rcx)
7190 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm9, (%r8)
7191 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%r9)
7192 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%r10)
7193 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
7194 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
7195 ; AVX512DQ-BW-FCP-NEXT: retq
7196 %wide.vec = load <224 x i8>, ptr %in.vec, align 64
7197 %strided.vec0 = shufflevector <224 x i8> %wide.vec, <224 x i8> poison, <32 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105, i32 112, i32 119, i32 126, i32 133, i32 140, i32 147, i32 154, i32 161, i32 168, i32 175, i32 182, i32 189, i32 196, i32 203, i32 210, i32 217>
7198 %strided.vec1 = shufflevector <224 x i8> %wide.vec, <224 x i8> poison, <32 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106, i32 113, i32 120, i32 127, i32 134, i32 141, i32 148, i32 155, i32 162, i32 169, i32 176, i32 183, i32 190, i32 197, i32 204, i32 211, i32 218>
7199 %strided.vec2 = shufflevector <224 x i8> %wide.vec, <224 x i8> poison, <32 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107, i32 114, i32 121, i32 128, i32 135, i32 142, i32 149, i32 156, i32 163, i32 170, i32 177, i32 184, i32 191, i32 198, i32 205, i32 212, i32 219>
7200 %strided.vec3 = shufflevector <224 x i8> %wide.vec, <224 x i8> poison, <32 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108, i32 115, i32 122, i32 129, i32 136, i32 143, i32 150, i32 157, i32 164, i32 171, i32 178, i32 185, i32 192, i32 199, i32 206, i32 213, i32 220>
7201 %strided.vec4 = shufflevector <224 x i8> %wide.vec, <224 x i8> poison, <32 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109, i32 116, i32 123, i32 130, i32 137, i32 144, i32 151, i32 158, i32 165, i32 172, i32 179, i32 186, i32 193, i32 200, i32 207, i32 214, i32 221>
7202 %strided.vec5 = shufflevector <224 x i8> %wide.vec, <224 x i8> poison, <32 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110, i32 117, i32 124, i32 131, i32 138, i32 145, i32 152, i32 159, i32 166, i32 173, i32 180, i32 187, i32 194, i32 201, i32 208, i32 215, i32 222>
7203 %strided.vec6 = shufflevector <224 x i8> %wide.vec, <224 x i8> poison, <32 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111, i32 118, i32 125, i32 132, i32 139, i32 146, i32 153, i32 160, i32 167, i32 174, i32 181, i32 188, i32 195, i32 202, i32 209, i32 216, i32 223>
7204 store <32 x i8> %strided.vec0, ptr %out.vec0, align 64
7205 store <32 x i8> %strided.vec1, ptr %out.vec1, align 64
7206 store <32 x i8> %strided.vec2, ptr %out.vec2, align 64
7207 store <32 x i8> %strided.vec3, ptr %out.vec3, align 64
7208 store <32 x i8> %strided.vec4, ptr %out.vec4, align 64
7209 store <32 x i8> %strided.vec5, ptr %out.vec5, align 64
7210 store <32 x i8> %strided.vec6, ptr %out.vec6, align 64
7211 ret void
7212 }
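; A minimal C sketch (illustrative only, not part of the checked IR) of the
; stride-7 de-interleave that the shufflevector masks above express; the
; function name and parameters here are hypothetical.
;
;   void deinterleave7(const unsigned char *in, unsigned char *out[7], int n) {
;     for (int i = 0; i < n; ++i)        /* n == 32 for the vf32 case above */
;       for (int j = 0; j < 7; ++j)
;         out[j][i] = in[7 * i + j];     /* element i of strided.vec j */
;   }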
7214 define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
7215 ; SSE-LABEL: load_i8_stride7_vf64:
7216 ; SSE: # %bb.0:
7217 ; SSE-NEXT: subq $1528, %rsp # imm = 0x5F8
7218 ; SSE-NEXT: movdqa 208(%rdi), %xmm12
7219 ; SSE-NEXT: movdqa 192(%rdi), %xmm5
7220 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7221 ; SSE-NEXT: movdqa 176(%rdi), %xmm8
7222 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7223 ; SSE-NEXT: movdqa 112(%rdi), %xmm4
7224 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7225 ; SSE-NEXT: movdqa 128(%rdi), %xmm3
7226 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7227 ; SSE-NEXT: movdqa 160(%rdi), %xmm6
7228 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7229 ; SSE-NEXT: movdqa 144(%rdi), %xmm1
7230 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7231 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,0,65535,65535]
7232 ; SSE-NEXT: movdqa %xmm2, %xmm0
7233 ; SSE-NEXT: pandn %xmm1, %xmm0
7234 ; SSE-NEXT: movdqa %xmm6, %xmm1
7235 ; SSE-NEXT: pand %xmm2, %xmm1
7236 ; SSE-NEXT: movdqa %xmm2, %xmm7
7237 ; SSE-NEXT: por %xmm0, %xmm1
7238 ; SSE-NEXT: pxor %xmm6, %xmm6
7239 ; SSE-NEXT: movdqa %xmm1, %xmm0
7240 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
7241 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
7242 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7243 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
7244 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
7245 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,5,6]
7246 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
7247 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
7248 ; SSE-NEXT: packuswb %xmm0, %xmm2
7249 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255]
7250 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,0,65535,65535,65535,0,65535]
7251 ; SSE-NEXT: movdqa %xmm11, %xmm1
7252 ; SSE-NEXT: pandn %xmm3, %xmm1
7253 ; SSE-NEXT: movdqa %xmm4, %xmm3
7254 ; SSE-NEXT: pand %xmm11, %xmm3
7255 ; SSE-NEXT: por %xmm1, %xmm3
7256 ; SSE-NEXT: movdqa %xmm3, %xmm1
7257 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
7258 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,0,65535,0,65535]
7259 ; SSE-NEXT: movdqa %xmm10, %xmm4
7260 ; SSE-NEXT: pandn %xmm1, %xmm4
7261 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
7262 ; SSE-NEXT: pand %xmm10, %xmm3
7263 ; SSE-NEXT: por %xmm4, %xmm3
7264 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3]
7265 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
7266 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1]
7267 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
7268 ; SSE-NEXT: packuswb %xmm1, %xmm1
7269 ; SSE-NEXT: pand %xmm0, %xmm1
7270 ; SSE-NEXT: movdqa %xmm0, %xmm3
7271 ; SSE-NEXT: pandn %xmm2, %xmm3
7272 ; SSE-NEXT: por %xmm3, %xmm1
7273 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,0,65535]
7274 ; SSE-NEXT: movdqa %xmm9, %xmm2
7275 ; SSE-NEXT: pandn %xmm8, %xmm2
7276 ; SSE-NEXT: movdqa %xmm5, %xmm3
7277 ; SSE-NEXT: pand %xmm9, %xmm3
7278 ; SSE-NEXT: por %xmm2, %xmm3
7279 ; SSE-NEXT: movdqa %xmm3, %xmm2
7280 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
7281 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7]
7282 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15]
7283 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
7284 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
7285 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
7286 ; SSE-NEXT: movdqa %xmm12, %xmm3
7287 ; SSE-NEXT: movdqa %xmm12, %xmm4
7288 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
7289 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7290 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
7291 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7292 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
7293 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
7294 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
7295 ; SSE-NEXT: packuswb %xmm3, %xmm3
7296 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,65535,65535,0]
7297 ; SSE-NEXT: movdqa %xmm12, %xmm4
7298 ; SSE-NEXT: pandn %xmm3, %xmm4
7299 ; SSE-NEXT: packuswb %xmm2, %xmm2
7300 ; SSE-NEXT: pand %xmm12, %xmm2
7301 ; SSE-NEXT: por %xmm2, %xmm4
7302 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,0]
7303 ; SSE-NEXT: movdqa %xmm8, %xmm2
7304 ; SSE-NEXT: pandn %xmm4, %xmm2
7305 ; SSE-NEXT: pand %xmm8, %xmm1
7306 ; SSE-NEXT: por %xmm1, %xmm2
7307 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7308 ; SSE-NEXT: movdqa 256(%rdi), %xmm2
7309 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7310 ; SSE-NEXT: movdqa %xmm7, %xmm1
7311 ; SSE-NEXT: pandn %xmm2, %xmm1
7312 ; SSE-NEXT: movdqa 272(%rdi), %xmm2
7313 ; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
7314 ; SSE-NEXT: pand %xmm7, %xmm2
7315 ; SSE-NEXT: por %xmm1, %xmm2
7316 ; SSE-NEXT: movdqa %xmm2, %xmm1
7317 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
7318 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
7319 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
7320 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
7321 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
7322 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6]
7323 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
7324 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
7325 ; SSE-NEXT: packuswb %xmm1, %xmm2
7326 ; SSE-NEXT: movdqa %xmm0, %xmm3
7327 ; SSE-NEXT: pandn %xmm2, %xmm3
7328 ; SSE-NEXT: movdqa 240(%rdi), %xmm2
7329 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7330 ; SSE-NEXT: movdqa %xmm11, %xmm1
7331 ; SSE-NEXT: pandn %xmm2, %xmm1
7332 ; SSE-NEXT: movdqa 224(%rdi), %xmm2
7333 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7334 ; SSE-NEXT: pand %xmm11, %xmm2
7335 ; SSE-NEXT: por %xmm1, %xmm2
7336 ; SSE-NEXT: movdqa %xmm2, %xmm1
7337 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
7338 ; SSE-NEXT: movdqa %xmm10, %xmm4
7339 ; SSE-NEXT: pandn %xmm1, %xmm4
7340 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
7341 ; SSE-NEXT: pand %xmm10, %xmm2
7342 ; SSE-NEXT: por %xmm4, %xmm2
7343 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3]
7344 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
7345 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1]
7346 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
7347 ; SSE-NEXT: packuswb %xmm1, %xmm1
7348 ; SSE-NEXT: pand %xmm0, %xmm1
7349 ; SSE-NEXT: por %xmm3, %xmm1
7350 ; SSE-NEXT: movdqa 288(%rdi), %xmm3
7351 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7352 ; SSE-NEXT: movdqa %xmm9, %xmm2
7353 ; SSE-NEXT: pandn %xmm3, %xmm2
7354 ; SSE-NEXT: movdqa 304(%rdi), %xmm3
7355 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7356 ; SSE-NEXT: pand %xmm9, %xmm3
7357 ; SSE-NEXT: por %xmm2, %xmm3
7358 ; SSE-NEXT: movdqa %xmm3, %xmm2
7359 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
7360 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7]
7361 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15]
7362 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
7363 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
7364 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
7365 ; SSE-NEXT: movdqa 320(%rdi), %xmm3
7366 ; SSE-NEXT: movdqa %xmm3, %xmm4
7367 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
7368 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7369 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
7370 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7371 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
7372 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
7373 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
7374 ; SSE-NEXT: packuswb %xmm3, %xmm3
7375 ; SSE-NEXT: movdqa %xmm12, %xmm4
7376 ; SSE-NEXT: pandn %xmm3, %xmm4
7377 ; SSE-NEXT: packuswb %xmm2, %xmm2
7378 ; SSE-NEXT: pand %xmm12, %xmm2
7379 ; SSE-NEXT: por %xmm2, %xmm4
7380 ; SSE-NEXT: movdqa %xmm8, %xmm2
7381 ; SSE-NEXT: pandn %xmm4, %xmm2
7382 ; SSE-NEXT: pand %xmm8, %xmm1
7383 ; SSE-NEXT: por %xmm1, %xmm2
7384 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7385 ; SSE-NEXT: movdqa 368(%rdi), %xmm2
7386 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7387 ; SSE-NEXT: movdqa %xmm7, %xmm1
7388 ; SSE-NEXT: pandn %xmm2, %xmm1
7389 ; SSE-NEXT: movdqa 384(%rdi), %xmm2
7390 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7391 ; SSE-NEXT: pand %xmm7, %xmm2
7392 ; SSE-NEXT: por %xmm1, %xmm2
7393 ; SSE-NEXT: movdqa %xmm2, %xmm1
7394 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
7395 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
7396 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
7397 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
7398 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
7399 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6]
7400 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
7401 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
7402 ; SSE-NEXT: packuswb %xmm1, %xmm2
7403 ; SSE-NEXT: movdqa %xmm0, %xmm3
7404 ; SSE-NEXT: pandn %xmm2, %xmm3
7405 ; SSE-NEXT: movdqa 352(%rdi), %xmm2
7406 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7407 ; SSE-NEXT: movdqa %xmm11, %xmm1
7408 ; SSE-NEXT: pandn %xmm2, %xmm1
7409 ; SSE-NEXT: movdqa 336(%rdi), %xmm2
7410 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7411 ; SSE-NEXT: pand %xmm11, %xmm2
7412 ; SSE-NEXT: por %xmm1, %xmm2
7413 ; SSE-NEXT: movdqa %xmm2, %xmm1
7414 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
7415 ; SSE-NEXT: movdqa %xmm10, %xmm4
7416 ; SSE-NEXT: pandn %xmm1, %xmm4
7417 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
7418 ; SSE-NEXT: pand %xmm10, %xmm2
7419 ; SSE-NEXT: por %xmm4, %xmm2
7420 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3]
7421 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
7422 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1]
7423 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
7424 ; SSE-NEXT: packuswb %xmm1, %xmm1
7425 ; SSE-NEXT: pand %xmm0, %xmm1
7426 ; SSE-NEXT: por %xmm3, %xmm1
7427 ; SSE-NEXT: movdqa 400(%rdi), %xmm3
7428 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7429 ; SSE-NEXT: movdqa %xmm9, %xmm2
7430 ; SSE-NEXT: pandn %xmm3, %xmm2
7431 ; SSE-NEXT: movdqa 416(%rdi), %xmm14
7432 ; SSE-NEXT: movdqa %xmm14, %xmm3
7433 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7434 ; SSE-NEXT: pand %xmm9, %xmm3
7435 ; SSE-NEXT: por %xmm2, %xmm3
7436 ; SSE-NEXT: movdqa %xmm3, %xmm2
7437 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
7438 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7]
7439 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15]
7440 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
7441 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
7442 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
7443 ; SSE-NEXT: movdqa 432(%rdi), %xmm3
7444 ; SSE-NEXT: movdqa %xmm3, %xmm4
7445 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
7446 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7447 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
7448 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7449 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
7450 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
7451 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
7452 ; SSE-NEXT: packuswb %xmm3, %xmm3
7453 ; SSE-NEXT: movdqa %xmm12, %xmm4
7454 ; SSE-NEXT: pandn %xmm3, %xmm4
7455 ; SSE-NEXT: packuswb %xmm2, %xmm2
7456 ; SSE-NEXT: pand %xmm12, %xmm2
7457 ; SSE-NEXT: por %xmm2, %xmm4
7458 ; SSE-NEXT: movdqa %xmm8, %xmm2
7459 ; SSE-NEXT: pandn %xmm4, %xmm2
7460 ; SSE-NEXT: pand %xmm8, %xmm1
7461 ; SSE-NEXT: por %xmm1, %xmm2
7462 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7463 ; SSE-NEXT: movdqa 32(%rdi), %xmm15
7464 ; SSE-NEXT: movdqa %xmm7, %xmm1
7465 ; SSE-NEXT: pandn %xmm15, %xmm1
7466 ; SSE-NEXT: movdqa 48(%rdi), %xmm2
7467 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7468 ; SSE-NEXT: pand %xmm7, %xmm2
7469 ; SSE-NEXT: por %xmm1, %xmm2
7470 ; SSE-NEXT: movdqa %xmm2, %xmm1
7471 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
7472 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
7473 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
7474 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
7475 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
7476 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,6]
7477 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
7478 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
7479 ; SSE-NEXT: packuswb %xmm1, %xmm2
7480 ; SSE-NEXT: movdqa 16(%rdi), %xmm3
7481 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7482 ; SSE-NEXT: movdqa %xmm11, %xmm1
7483 ; SSE-NEXT: pandn %xmm3, %xmm1
7484 ; SSE-NEXT: movdqa (%rdi), %xmm4
7485 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7486 ; SSE-NEXT: pand %xmm11, %xmm4
7487 ; SSE-NEXT: por %xmm1, %xmm4
7488 ; SSE-NEXT: movdqa %xmm4, %xmm1
7489 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
7490 ; SSE-NEXT: movdqa %xmm10, %xmm5
7491 ; SSE-NEXT: pandn %xmm1, %xmm5
7492 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
7493 ; SSE-NEXT: pand %xmm10, %xmm4
7494 ; SSE-NEXT: por %xmm5, %xmm4
7495 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,1,3]
7496 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
7497 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1]
7498 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
7499 ; SSE-NEXT: packuswb %xmm1, %xmm1
7500 ; SSE-NEXT: pand %xmm0, %xmm1
7501 ; SSE-NEXT: pandn %xmm2, %xmm0
7502 ; SSE-NEXT: por %xmm0, %xmm1
7503 ; SSE-NEXT: movdqa 64(%rdi), %xmm2
7504 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7505 ; SSE-NEXT: movdqa %xmm9, %xmm0
7506 ; SSE-NEXT: pandn %xmm2, %xmm0
7507 ; SSE-NEXT: movdqa 80(%rdi), %xmm2
7508 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7509 ; SSE-NEXT: pand %xmm9, %xmm2
7510 ; SSE-NEXT: por %xmm0, %xmm2
7511 ; SSE-NEXT: movdqa %xmm2, %xmm0
7512 ; SSE-NEXT: pxor %xmm5, %xmm5
7513 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
7514 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
7515 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
7516 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
7517 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
7518 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
7519 ; SSE-NEXT: movdqa 96(%rdi), %xmm2
7520 ; SSE-NEXT: movdqa %xmm2, %xmm3
7521 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
7522 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7523 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
7524 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7525 ; SSE-NEXT: pxor %xmm7, %xmm7
7526 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
7527 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
7528 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
7529 ; SSE-NEXT: packuswb %xmm2, %xmm2
7530 ; SSE-NEXT: movdqa %xmm12, %xmm4
7531 ; SSE-NEXT: pandn %xmm2, %xmm4
7532 ; SSE-NEXT: packuswb %xmm0, %xmm0
7533 ; SSE-NEXT: pand %xmm12, %xmm0
7534 ; SSE-NEXT: por %xmm0, %xmm4
7535 ; SSE-NEXT: pand %xmm8, %xmm1
7536 ; SSE-NEXT: pandn %xmm4, %xmm8
7537 ; SSE-NEXT: por %xmm1, %xmm8
7538 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7539 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,0,65535,65535]
7540 ; SSE-NEXT: movdqa %xmm2, %xmm0
7541 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
7542 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7543 ; SSE-NEXT: pand %xmm2, %xmm1
7544 ; SSE-NEXT: movdqa %xmm2, %xmm13
7545 ; SSE-NEXT: por %xmm0, %xmm1
7546 ; SSE-NEXT: movdqa %xmm1, %xmm2
7547 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
7548 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535]
7549 ; SSE-NEXT: movdqa %xmm0, %xmm4
7550 ; SSE-NEXT: pandn %xmm2, %xmm4
7551 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
7552 ; SSE-NEXT: pand %xmm0, %xmm1
7553 ; SSE-NEXT: por %xmm4, %xmm1
7554 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
7555 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
7556 ; SSE-NEXT: psrld $16, %xmm2
7557 ; SSE-NEXT: packuswb %xmm2, %xmm1
7558 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255]
7559 ; SSE-NEXT: movdqa %xmm4, %xmm2
7560 ; SSE-NEXT: movdqa %xmm4, %xmm8
7561 ; SSE-NEXT: pandn %xmm1, %xmm2
7562 ; SSE-NEXT: movdqa %xmm9, %xmm1
7563 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7564 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7565 ; SSE-NEXT: pand %xmm9, %xmm4
7566 ; SSE-NEXT: por %xmm1, %xmm4
7567 ; SSE-NEXT: movdqa %xmm4, %xmm1
7568 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
7569 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,65535,65535,0,65535]
7570 ; SSE-NEXT: movdqa %xmm6, %xmm5
7571 ; SSE-NEXT: pandn %xmm1, %xmm5
7572 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
7573 ; SSE-NEXT: pand %xmm6, %xmm4
7574 ; SSE-NEXT: por %xmm5, %xmm4
7575 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3]
7576 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
7577 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
7578 ; SSE-NEXT: packuswb %xmm1, %xmm1
7579 ; SSE-NEXT: pand %xmm8, %xmm1
7580 ; SSE-NEXT: por %xmm2, %xmm1
7581 ; SSE-NEXT: movdqa %xmm11, %xmm2
7582 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7583 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7584 ; SSE-NEXT: pand %xmm11, %xmm4
7585 ; SSE-NEXT: por %xmm2, %xmm4
7586 ; SSE-NEXT: movdqa %xmm4, %xmm2
7587 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
7588 ; SSE-NEXT: movdqa %xmm10, %xmm5
7589 ; SSE-NEXT: pandn %xmm2, %xmm5
7590 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
7591 ; SSE-NEXT: pand %xmm10, %xmm4
7592 ; SSE-NEXT: por %xmm5, %xmm4
7593 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7594 ; SSE-NEXT: pslld $16, %xmm2
7595 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7596 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
7597 ; SSE-NEXT: packuswb %xmm5, %xmm2
7598 ; SSE-NEXT: movdqa %xmm12, %xmm5
7599 ; SSE-NEXT: pandn %xmm2, %xmm5
7600 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
7601 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7]
7602 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7]
7603 ; SSE-NEXT: packuswb %xmm2, %xmm2
7604 ; SSE-NEXT: pand %xmm12, %xmm2
7605 ; SSE-NEXT: por %xmm2, %xmm5
7606 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
7607 ; SSE-NEXT: movdqa %xmm4, %xmm2
7608 ; SSE-NEXT: pandn %xmm5, %xmm2
7609 ; SSE-NEXT: pand %xmm4, %xmm1
7610 ; SSE-NEXT: movdqa %xmm4, %xmm3
7611 ; SSE-NEXT: por %xmm1, %xmm2
7612 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7613 ; SSE-NEXT: movdqa %xmm13, %xmm1
7614 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7615 ; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload
7616 ; SSE-NEXT: pand %xmm13, %xmm2
7617 ; SSE-NEXT: por %xmm1, %xmm2
7618 ; SSE-NEXT: movdqa %xmm2, %xmm1
7619 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15]
7620 ; SSE-NEXT: movdqa %xmm0, %xmm4
7621 ; SSE-NEXT: pandn %xmm1, %xmm4
7622 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
7623 ; SSE-NEXT: pand %xmm0, %xmm2
7624 ; SSE-NEXT: por %xmm4, %xmm2
7625 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
7626 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6]
7627 ; SSE-NEXT: psrld $16, %xmm1
7628 ; SSE-NEXT: packuswb %xmm1, %xmm2
7629 ; SSE-NEXT: movdqa %xmm8, %xmm4
7630 ; SSE-NEXT: pandn %xmm2, %xmm4
7631 ; SSE-NEXT: movdqa %xmm9, %xmm1
7632 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7633 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7634 ; SSE-NEXT: pand %xmm9, %xmm2
7635 ; SSE-NEXT: por %xmm1, %xmm2
7636 ; SSE-NEXT: movdqa %xmm2, %xmm1
7637 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
7638 ; SSE-NEXT: movdqa %xmm6, %xmm5
7639 ; SSE-NEXT: pandn %xmm1, %xmm5
7640 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
7641 ; SSE-NEXT: pand %xmm6, %xmm2
7642 ; SSE-NEXT: por %xmm5, %xmm2
7643 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3]
7644 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
7645 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
7646 ; SSE-NEXT: packuswb %xmm1, %xmm1
7647 ; SSE-NEXT: pand %xmm8, %xmm1
7648 ; SSE-NEXT: por %xmm4, %xmm1
7649 ; SSE-NEXT: movdqa %xmm11, %xmm2
7650 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
7651 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7652 ; SSE-NEXT: pand %xmm11, %xmm4
7653 ; SSE-NEXT: por %xmm2, %xmm4
7654 ; SSE-NEXT: movdqa %xmm4, %xmm2
7655 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
7656 ; SSE-NEXT: movdqa %xmm10, %xmm5
7657 ; SSE-NEXT: pandn %xmm2, %xmm5
7658 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
7659 ; SSE-NEXT: pand %xmm10, %xmm4
7660 ; SSE-NEXT: por %xmm5, %xmm4
7661 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7662 ; SSE-NEXT: pslld $16, %xmm2
7663 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7664 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
7665 ; SSE-NEXT: packuswb %xmm5, %xmm2
7666 ; SSE-NEXT: movdqa %xmm12, %xmm5
7667 ; SSE-NEXT: pandn %xmm2, %xmm5
7668 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
7669 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7]
7670 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7]
7671 ; SSE-NEXT: packuswb %xmm2, %xmm2
7672 ; SSE-NEXT: pand %xmm12, %xmm2
7673 ; SSE-NEXT: por %xmm2, %xmm5
7674 ; SSE-NEXT: movdqa %xmm3, %xmm2
7675 ; SSE-NEXT: pandn %xmm5, %xmm2
7676 ; SSE-NEXT: pand %xmm3, %xmm1
7677 ; SSE-NEXT: por %xmm1, %xmm2
7678 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7679 ; SSE-NEXT: movdqa %xmm13, %xmm1
7680 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7681 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7682 ; SSE-NEXT: pand %xmm13, %xmm2
7683 ; SSE-NEXT: por %xmm1, %xmm2
7684 ; SSE-NEXT: movdqa %xmm2, %xmm1
7685 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm7[8],xmm1[9],xmm7[9],xmm1[10],xmm7[10],xmm1[11],xmm7[11],xmm1[12],xmm7[12],xmm1[13],xmm7[13],xmm1[14],xmm7[14],xmm1[15],xmm7[15]
7686 ; SSE-NEXT: movdqa %xmm0, %xmm4
7687 ; SSE-NEXT: pandn %xmm1, %xmm4
7688 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3],xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7]
7689 ; SSE-NEXT: pand %xmm0, %xmm2
7690 ; SSE-NEXT: por %xmm4, %xmm2
7691 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
7692 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6]
7693 ; SSE-NEXT: psrld $16, %xmm1
7694 ; SSE-NEXT: packuswb %xmm1, %xmm2
7695 ; SSE-NEXT: movdqa %xmm8, %xmm4
7696 ; SSE-NEXT: pandn %xmm2, %xmm4
7697 ; SSE-NEXT: movdqa %xmm9, %xmm1
7698 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7699 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7700 ; SSE-NEXT: pand %xmm9, %xmm2
7701 ; SSE-NEXT: por %xmm1, %xmm2
7702 ; SSE-NEXT: movdqa %xmm2, %xmm1
7703 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
7704 ; SSE-NEXT: movdqa %xmm6, %xmm5
7705 ; SSE-NEXT: pandn %xmm1, %xmm5
7706 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
7707 ; SSE-NEXT: pand %xmm6, %xmm2
7708 ; SSE-NEXT: por %xmm5, %xmm2
7709 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3]
7710 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
7711 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
7712 ; SSE-NEXT: packuswb %xmm1, %xmm1
7713 ; SSE-NEXT: pand %xmm8, %xmm1
7714 ; SSE-NEXT: por %xmm4, %xmm1
7715 ; SSE-NEXT: movdqa %xmm11, %xmm2
7716 ; SSE-NEXT: pandn %xmm14, %xmm2
7717 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7718 ; SSE-NEXT: pand %xmm11, %xmm4
7719 ; SSE-NEXT: por %xmm2, %xmm4
7720 ; SSE-NEXT: movdqa %xmm4, %xmm2
7721 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15]
7722 ; SSE-NEXT: movdqa %xmm10, %xmm5
7723 ; SSE-NEXT: pandn %xmm2, %xmm5
7724 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
7725 ; SSE-NEXT: pand %xmm10, %xmm4
7726 ; SSE-NEXT: por %xmm5, %xmm4
7727 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7728 ; SSE-NEXT: pslld $16, %xmm2
7729 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7730 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
7731 ; SSE-NEXT: packuswb %xmm5, %xmm2
7732 ; SSE-NEXT: movdqa %xmm12, %xmm5
7733 ; SSE-NEXT: pandn %xmm2, %xmm5
7734 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
7735 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,3,2,4,5,6,7]
7736 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,6,7]
7737 ; SSE-NEXT: packuswb %xmm2, %xmm2
7738 ; SSE-NEXT: pand %xmm12, %xmm2
7739 ; SSE-NEXT: por %xmm2, %xmm5
7740 ; SSE-NEXT: movdqa %xmm3, %xmm2
7741 ; SSE-NEXT: pandn %xmm5, %xmm2
7742 ; SSE-NEXT: pand %xmm3, %xmm1
7743 ; SSE-NEXT: por %xmm1, %xmm2
7744 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7745 ; SSE-NEXT: movdqa %xmm13, %xmm1
7746 ; SSE-NEXT: pandn %xmm15, %xmm1
7747 ; SSE-NEXT: movdqa %xmm15, %xmm3
7748 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
7749 ; SSE-NEXT: movdqa %xmm15, %xmm2
7750 ; SSE-NEXT: pand %xmm13, %xmm2
7751 ; SSE-NEXT: por %xmm1, %xmm2
7752 ; SSE-NEXT: movdqa %xmm2, %xmm1
7753 ; SSE-NEXT: pxor %xmm4, %xmm4
7754 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
7755 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
7756 ; SSE-NEXT: pxor %xmm5, %xmm5
7757 ; SSE-NEXT: pand %xmm0, %xmm2
7758 ; SSE-NEXT: pandn %xmm1, %xmm0
7759 ; SSE-NEXT: por %xmm2, %xmm0
7760 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
7761 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,6]
7762 ; SSE-NEXT: psrld $16, %xmm1
7763 ; SSE-NEXT: packuswb %xmm1, %xmm0
7764 ; SSE-NEXT: movdqa %xmm8, %xmm1
7765 ; SSE-NEXT: pandn %xmm0, %xmm1
7766 ; SSE-NEXT: movdqa %xmm9, %xmm0
7767 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
7768 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7769 ; SSE-NEXT: pand %xmm9, %xmm2
7770 ; SSE-NEXT: movdqa %xmm9, %xmm12
7771 ; SSE-NEXT: por %xmm0, %xmm2
7772 ; SSE-NEXT: movdqa %xmm2, %xmm0
7773 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
7774 ; SSE-NEXT: movdqa %xmm6, %xmm4
7775 ; SSE-NEXT: pandn %xmm0, %xmm4
7776 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
7777 ; SSE-NEXT: pand %xmm6, %xmm2
7778 ; SSE-NEXT: por %xmm4, %xmm2
7779 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,3]
7780 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
7781 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
7782 ; SSE-NEXT: packuswb %xmm0, %xmm0
7783 ; SSE-NEXT: pand %xmm8, %xmm0
7784 ; SSE-NEXT: por %xmm1, %xmm0
7785 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7786 ; SSE-NEXT: movdqa %xmm11, %xmm0
7787 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
7788 ; SSE-NEXT: pandn %xmm14, %xmm0
7789 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7790 ; SSE-NEXT: pand %xmm11, %xmm1
7791 ; SSE-NEXT: por %xmm0, %xmm1
7792 ; SSE-NEXT: movdqa %xmm1, %xmm0
7793 ; SSE-NEXT: pxor %xmm2, %xmm2
7794 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
7795 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
7796 ; SSE-NEXT: pand %xmm10, %xmm1
7797 ; SSE-NEXT: pandn %xmm0, %xmm10
7798 ; SSE-NEXT: por %xmm1, %xmm10
7799 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7800 ; SSE-NEXT: movdqa %xmm11, %xmm0
7801 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
7802 ; SSE-NEXT: pandn %xmm7, %xmm0
7803 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7804 ; SSE-NEXT: movdqa %xmm2, %xmm1
7805 ; SSE-NEXT: pand %xmm11, %xmm1
7806 ; SSE-NEXT: por %xmm0, %xmm1
7807 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7808 ; SSE-NEXT: movdqa %xmm11, %xmm0
7809 ; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload
7810 ; SSE-NEXT: pandn %xmm8, %xmm0
7811 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7812 ; SSE-NEXT: movdqa %xmm5, %xmm1
7813 ; SSE-NEXT: pand %xmm11, %xmm1
7814 ; SSE-NEXT: por %xmm0, %xmm1
7815 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7816 ; SSE-NEXT: movdqa %xmm11, %xmm0
7817 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
7818 ; SSE-NEXT: pandn %xmm9, %xmm0
7819 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7820 ; SSE-NEXT: movdqa %xmm1, %xmm4
7821 ; SSE-NEXT: pand %xmm11, %xmm4
7822 ; SSE-NEXT: por %xmm0, %xmm4
7823 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7824 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7825 ; SSE-NEXT: movdqa %xmm3, %xmm0
7826 ; SSE-NEXT: pand %xmm11, %xmm0
7827 ; SSE-NEXT: movdqa %xmm15, %xmm6
7828 ; SSE-NEXT: pandn %xmm15, %xmm11
7829 ; SSE-NEXT: por %xmm0, %xmm11
7830 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7831 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,0,65535,65535,0,65535,65535,65535]
7832 ; SSE-NEXT: movdqa %xmm15, %xmm0
7833 ; SSE-NEXT: pandn %xmm2, %xmm0
7834 ; SSE-NEXT: movdqa %xmm12, %xmm2
7835 ; SSE-NEXT: movdqa %xmm7, %xmm4
7836 ; SSE-NEXT: pandn %xmm7, %xmm2
7837 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7838 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3]
7839 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7840 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7841 ; SSE-NEXT: pand %xmm15, %xmm4
7842 ; SSE-NEXT: por %xmm0, %xmm4
7843 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7844 ; SSE-NEXT: movdqa %xmm15, %xmm0
7845 ; SSE-NEXT: pandn %xmm5, %xmm0
7846 ; SSE-NEXT: movdqa %xmm12, %xmm2
7847 ; SSE-NEXT: movdqa %xmm8, %xmm4
7848 ; SSE-NEXT: pandn %xmm8, %xmm2
7849 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7850 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3]
7851 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7852 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7853 ; SSE-NEXT: pand %xmm15, %xmm4
7854 ; SSE-NEXT: por %xmm0, %xmm4
7855 ; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill
7856 ; SSE-NEXT: movdqa %xmm15, %xmm0
7857 ; SSE-NEXT: pandn %xmm1, %xmm0
7858 ; SSE-NEXT: movdqa %xmm12, %xmm2
7859 ; SSE-NEXT: movdqa %xmm12, %xmm1
7860 ; SSE-NEXT: movdqa %xmm9, %xmm4
7861 ; SSE-NEXT: pandn %xmm9, %xmm1
7862 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7863 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,2,2,3]
7864 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7865 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7866 ; SSE-NEXT: pand %xmm15, %xmm4
7867 ; SSE-NEXT: por %xmm0, %xmm4
7868 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7869 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
7870 ; SSE-NEXT: pand %xmm15, %xmm9
7871 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
7872 ; SSE-NEXT: pand %xmm15, %xmm12
7873 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7874 ; SSE-NEXT: pand %xmm15, %xmm0
7875 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7876 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7877 ; SSE-NEXT: pand %xmm15, %xmm0
7878 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7879 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7880 ; SSE-NEXT: pand %xmm15, %xmm0
7881 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7882 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7883 ; SSE-NEXT: pand %xmm15, %xmm0
7884 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7885 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7886 ; SSE-NEXT: pand %xmm15, %xmm0
7887 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7888 ; SSE-NEXT: movdqa %xmm14, %xmm0
7889 ; SSE-NEXT: pand %xmm15, %xmm0
7890 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7891 ; SSE-NEXT: movdqa %xmm2, %xmm4
7892 ; SSE-NEXT: movdqa %xmm6, %xmm0
7893 ; SSE-NEXT: pandn %xmm6, %xmm4
7894 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7895 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,2,2,3]
7896 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7897 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7898 ; SSE-NEXT: pand %xmm15, %xmm0
7899 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7900 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7901 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7902 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7903 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7904 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7905 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7906 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7907 ; SSE-NEXT: pandn %xmm3, %xmm15
7908 ; SSE-NEXT: por %xmm0, %xmm15
7909 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7910 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535]
7911 ; SSE-NEXT: movdqa %xmm1, %xmm2
7912 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
7913 ; SSE-NEXT: pandn %xmm7, %xmm2
7914 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7915 ; SSE-NEXT: movdqa %xmm7, %xmm10
7916 ; SSE-NEXT: movdqa %xmm7, %xmm4
7917 ; SSE-NEXT: movdqa %xmm1, %xmm2
7918 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
7919 ; SSE-NEXT: pandn %xmm6, %xmm2
7920 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7921 ; SSE-NEXT: movdqa %xmm6, %xmm8
7922 ; SSE-NEXT: movdqa %xmm1, %xmm2
7923 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7924 ; SSE-NEXT: pandn %xmm5, %xmm2
7925 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7926 ; SSE-NEXT: movdqa %xmm5, %xmm1
7927 ; SSE-NEXT: movdqa %xmm5, %xmm11
7928 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7929 ; SSE-NEXT: movdqa %xmm3, %xmm13
7930 ; SSE-NEXT: pslld $16, %xmm13
7931 ; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7932 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
7933 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
7934 ; SSE-NEXT: movdqa %xmm6, %xmm0
7935 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7936 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
7937 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
7938 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7939 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7940 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7941 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7942 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7943 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7944 ; SSE-NEXT: movdqa %xmm1, %xmm2
7945 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
7946 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
7947 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7948 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
7949 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7950 ; SSE-NEXT: movdqa %xmm14, %xmm4
7951 ; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7]
7952 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7953 ; SSE-NEXT: movdqa %xmm15, %xmm8
7954 ; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
7955 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7956 ; SSE-NEXT: movdqa %xmm0, %xmm11
7957 ; SSE-NEXT: movdqa %xmm1, %xmm0
7958 ; SSE-NEXT: movdqa %xmm1, %xmm2
7959 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
7960 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7961 ; SSE-NEXT: movdqa %xmm3, %xmm1
7962 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7963 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7964 ; SSE-NEXT: pxor %xmm0, %xmm0
7965 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
7966 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,7,5,6,7]
7967 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,2,0]
7968 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,6,4,6,5]
7969 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,65535,0,65535]
7970 ; SSE-NEXT: pand %xmm14, %xmm3
7971 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7972 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7973 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7974 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
7975 ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm3[0,1,2,3,7,5,6,7]
7976 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,2,0]
7977 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,6,4,6,5]
7978 ; SSE-NEXT: pand %xmm14, %xmm3
7979 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7980 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
7981 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7982 ; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
7983 ; SSE-NEXT: pxor %xmm3, %xmm3
7984 ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,5,6,7]
7985 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,2,0]
7986 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,4,6,5]
7987 ; SSE-NEXT: pand %xmm14, %xmm0
7988 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7989 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7990 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7991 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
7992 ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm0[0,1,2,3,7,5,6,7]
7993 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,2,0]
7994 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,6,4,6,5]
7995 ; SSE-NEXT: movdqa %xmm14, %xmm0
7996 ; SSE-NEXT: pand %xmm14, %xmm3
7997 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7998 ; SSE-NEXT: movdqa %xmm14, %xmm3
7999 ; SSE-NEXT: pandn %xmm4, %xmm3
8000 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8001 ; SSE-NEXT: pand %xmm14, %xmm7
8002 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8003 ; SSE-NEXT: movdqa %xmm14, %xmm3
8004 ; SSE-NEXT: pandn %xmm8, %xmm3
8005 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8006 ; SSE-NEXT: pand %xmm14, %xmm6
8007 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8008 ; SSE-NEXT: movdqa %xmm14, %xmm3
8009 ; SSE-NEXT: movdqa %xmm11, %xmm6
8010 ; SSE-NEXT: pandn %xmm11, %xmm3
8011 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8012 ; SSE-NEXT: pand %xmm14, %xmm5
8013 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8014 ; SSE-NEXT: movdqa %xmm2, %xmm3
8015 ; SSE-NEXT: pand %xmm14, %xmm3
8016 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8017 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8018 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8019 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8020 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8021 ; SSE-NEXT: pandn %xmm1, %xmm0
8022 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8023 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8024 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8025 ; SSE-NEXT: pxor %xmm0, %xmm0
8026 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
8027 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535]
8028 ; SSE-NEXT: pand %xmm11, %xmm3
8029 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8030 ; SSE-NEXT: pand %xmm11, %xmm4
8031 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8032 ; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload
8033 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8034 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
8035 ; SSE-NEXT: pand %xmm11, %xmm3
8036 ; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill
8037 ; SSE-NEXT: pand %xmm11, %xmm8
8038 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8039 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8040 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8041 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
8042 ; SSE-NEXT: pxor %xmm8, %xmm8
8043 ; SSE-NEXT: pand %xmm11, %xmm3
8044 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8045 ; SSE-NEXT: pand %xmm11, %xmm6
8046 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8047 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8048 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8049 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
8050 ; SSE-NEXT: pand %xmm11, %xmm0
8051 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8052 ; SSE-NEXT: pand %xmm11, %xmm1
8053 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8054 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8055 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8056 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8057 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8058 ; SSE-NEXT: pandn %xmm2, %xmm11
8059 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8060 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3]
8061 ; SSE-NEXT: packuswb %xmm2, %xmm3
8062 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,65535,0]
8063 ; SSE-NEXT: movdqa %xmm0, %xmm15
8064 ; SSE-NEXT: pandn %xmm3, %xmm15
8065 ; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8066 ; SSE-NEXT: # xmm3 = mem[0,3,2,3]
8067 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7]
8068 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,6,7]
8069 ; SSE-NEXT: packuswb %xmm3, %xmm3
8070 ; SSE-NEXT: pand %xmm0, %xmm3
8071 ; SSE-NEXT: movdqa %xmm0, %xmm4
8072 ; SSE-NEXT: por %xmm3, %xmm15
8073 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
8074 ; SSE-NEXT: movdqa %xmm0, %xmm3
8075 ; SSE-NEXT: pandn %xmm15, %xmm3
8076 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8077 ; SSE-NEXT: pand %xmm0, %xmm2
8078 ; SSE-NEXT: movdqa %xmm0, %xmm13
8079 ; SSE-NEXT: por %xmm2, %xmm3
8080 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8081 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,0,65535]
8082 ; SSE-NEXT: movdqa %xmm0, %xmm3
8083 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8084 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8085 ; SSE-NEXT: pand %xmm0, %xmm15
8086 ; SSE-NEXT: movdqa %xmm0, %xmm5
8087 ; SSE-NEXT: por %xmm3, %xmm15
8088 ; SSE-NEXT: movdqa %xmm15, %xmm3
8089 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
8090 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,65535,65535,0,65535]
8091 ; SSE-NEXT: movdqa %xmm14, %xmm0
8092 ; SSE-NEXT: pandn %xmm3, %xmm0
8093 ; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15]
8094 ; SSE-NEXT: pand %xmm14, %xmm15
8095 ; SSE-NEXT: por %xmm0, %xmm15
8096 ; SSE-NEXT: packuswb %xmm10, %xmm0
8097 ; SSE-NEXT: movdqa %xmm4, %xmm2
8098 ; SSE-NEXT: pandn %xmm0, %xmm2
8099 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,3,2,3]
8100 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
8101 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
8102 ; SSE-NEXT: packuswb %xmm0, %xmm0
8103 ; SSE-NEXT: pand %xmm4, %xmm0
8104 ; SSE-NEXT: por %xmm0, %xmm2
8105 ; SSE-NEXT: movdqa %xmm13, %xmm3
8106 ; SSE-NEXT: pandn %xmm2, %xmm3
8107 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8108 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8109 ; SSE-NEXT: por %xmm0, %xmm9
8110 ; SSE-NEXT: movdqa %xmm9, %xmm0
8111 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
8112 ; SSE-NEXT: movdqa %xmm14, %xmm2
8113 ; SSE-NEXT: pandn %xmm0, %xmm2
8114 ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
8115 ; SSE-NEXT: pand %xmm14, %xmm9
8116 ; SSE-NEXT: por %xmm2, %xmm9
8117 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8118 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,3,2,3]
8119 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8120 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8121 ; SSE-NEXT: movdqa %xmm1, %xmm2
8122 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8123 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535]
8124 ; SSE-NEXT: movdqa %xmm0, %xmm15
8125 ; SSE-NEXT: pandn %xmm2, %xmm15
8126 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
8127 ; SSE-NEXT: pand %xmm0, %xmm1
8128 ; SSE-NEXT: por %xmm15, %xmm1
8129 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,1,2,1]
8130 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7]
8131 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8132 ; SSE-NEXT: packuswb %xmm2, %xmm11
8133 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,255,255,255,255,255,255,255]
8134 ; SSE-NEXT: movdqa %xmm6, %xmm2
8135 ; SSE-NEXT: pandn %xmm11, %xmm2
8136 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,2,1,3]
8137 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
8138 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,3,3]
8139 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
8140 ; SSE-NEXT: packuswb %xmm1, %xmm1
8141 ; SSE-NEXT: pand %xmm6, %xmm1
8142 ; SSE-NEXT: por %xmm1, %xmm2
8143 ; SSE-NEXT: pand %xmm13, %xmm2
8144 ; SSE-NEXT: por %xmm3, %xmm2
8145 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8146 ; SSE-NEXT: movdqa %xmm5, %xmm15
8147 ; SSE-NEXT: movdqa %xmm5, %xmm1
8148 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8149 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8150 ; SSE-NEXT: pand %xmm5, %xmm2
8151 ; SSE-NEXT: por %xmm1, %xmm2
8152 ; SSE-NEXT: movdqa %xmm2, %xmm1
8153 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
8154 ; SSE-NEXT: movdqa %xmm14, %xmm3
8155 ; SSE-NEXT: pandn %xmm1, %xmm3
8156 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8157 ; SSE-NEXT: pand %xmm14, %xmm2
8158 ; SSE-NEXT: por %xmm3, %xmm2
8159 ; SSE-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8160 ; SSE-NEXT: movdqa %xmm4, %xmm3
8161 ; SSE-NEXT: pandn %xmm1, %xmm3
8162 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3]
8163 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
8164 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
8165 ; SSE-NEXT: packuswb %xmm1, %xmm1
8166 ; SSE-NEXT: pand %xmm4, %xmm1
8167 ; SSE-NEXT: movdqa %xmm4, %xmm10
8168 ; SSE-NEXT: por %xmm1, %xmm3
8169 ; SSE-NEXT: movdqa %xmm13, %xmm1
8170 ; SSE-NEXT: pandn %xmm3, %xmm1
8171 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8172 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8173 ; SSE-NEXT: por %xmm2, %xmm12
8174 ; SSE-NEXT: movdqa %xmm12, %xmm2
8175 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8176 ; SSE-NEXT: movdqa %xmm14, %xmm3
8177 ; SSE-NEXT: pandn %xmm2, %xmm3
8178 ; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
8179 ; SSE-NEXT: pand %xmm14, %xmm12
8180 ; SSE-NEXT: por %xmm3, %xmm12
8181 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8182 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[1,3,2,3]
8183 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8184 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
8185 ; SSE-NEXT: movdqa %xmm4, %xmm2
8186 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8187 ; SSE-NEXT: movdqa %xmm0, %xmm3
8188 ; SSE-NEXT: pandn %xmm2, %xmm3
8189 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
8190 ; SSE-NEXT: pand %xmm0, %xmm4
8191 ; SSE-NEXT: por %xmm3, %xmm4
8192 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,2,1]
8193 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
8194 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8195 ; SSE-NEXT: packuswb %xmm2, %xmm3
8196 ; SSE-NEXT: movdqa %xmm6, %xmm4
8197 ; SSE-NEXT: pandn %xmm3, %xmm4
8198 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,2,1,3]
8199 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
8200 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3]
8201 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
8202 ; SSE-NEXT: packuswb %xmm2, %xmm2
8203 ; SSE-NEXT: pand %xmm6, %xmm2
8204 ; SSE-NEXT: por %xmm2, %xmm4
8205 ; SSE-NEXT: pand %xmm13, %xmm4
8206 ; SSE-NEXT: por %xmm1, %xmm4
8207 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8208 ; SSE-NEXT: movdqa %xmm5, %xmm1
8209 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8210 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8211 ; SSE-NEXT: pand %xmm5, %xmm2
8212 ; SSE-NEXT: por %xmm1, %xmm2
8213 ; SSE-NEXT: movdqa %xmm2, %xmm1
8214 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
8215 ; SSE-NEXT: movdqa %xmm14, %xmm3
8216 ; SSE-NEXT: pandn %xmm1, %xmm3
8217 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8218 ; SSE-NEXT: pand %xmm14, %xmm2
8219 ; SSE-NEXT: por %xmm3, %xmm2
8220 ; SSE-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8221 ; SSE-NEXT: movdqa %xmm10, %xmm3
8222 ; SSE-NEXT: pandn %xmm1, %xmm3
8223 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3]
8224 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
8225 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
8226 ; SSE-NEXT: packuswb %xmm1, %xmm1
8227 ; SSE-NEXT: pand %xmm10, %xmm1
8228 ; SSE-NEXT: por %xmm1, %xmm3
8229 ; SSE-NEXT: movdqa %xmm13, %xmm1
8230 ; SSE-NEXT: pandn %xmm3, %xmm1
8231 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8232 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8233 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8234 ; SSE-NEXT: por %xmm2, %xmm4
8235 ; SSE-NEXT: movdqa %xmm4, %xmm2
8236 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8237 ; SSE-NEXT: movdqa %xmm14, %xmm3
8238 ; SSE-NEXT: pandn %xmm2, %xmm3
8239 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
8240 ; SSE-NEXT: pand %xmm14, %xmm4
8241 ; SSE-NEXT: por %xmm3, %xmm4
8242 ; SSE-NEXT: movdqa %xmm4, %xmm5
8243 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8244 ; SSE-NEXT: # xmm2 = mem[1,3,2,3]
8245 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8246 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
8247 ; SSE-NEXT: movdqa %xmm4, %xmm2
8248 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8249 ; SSE-NEXT: movdqa %xmm0, %xmm3
8250 ; SSE-NEXT: pandn %xmm2, %xmm3
8251 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
8252 ; SSE-NEXT: pand %xmm0, %xmm4
8253 ; SSE-NEXT: por %xmm3, %xmm4
8254 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,2,1]
8255 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
8256 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8257 ; SSE-NEXT: packuswb %xmm2, %xmm3
8258 ; SSE-NEXT: movdqa %xmm6, %xmm4
8259 ; SSE-NEXT: pandn %xmm3, %xmm4
8260 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,1,3]
8261 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
8262 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,3,3]
8263 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,0,3,4,5,6,7]
8264 ; SSE-NEXT: packuswb %xmm2, %xmm2
8265 ; SSE-NEXT: pand %xmm6, %xmm2
8266 ; SSE-NEXT: por %xmm2, %xmm4
8267 ; SSE-NEXT: pand %xmm13, %xmm4
8268 ; SSE-NEXT: por %xmm1, %xmm4
8269 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8270 ; SSE-NEXT: movdqa %xmm15, %xmm1
8271 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8272 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8273 ; SSE-NEXT: pand %xmm15, %xmm2
8274 ; SSE-NEXT: por %xmm1, %xmm2
8275 ; SSE-NEXT: movdqa %xmm2, %xmm1
8276 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
8277 ; SSE-NEXT: movdqa %xmm14, %xmm3
8278 ; SSE-NEXT: pandn %xmm1, %xmm3
8279 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8280 ; SSE-NEXT: pand %xmm14, %xmm2
8281 ; SSE-NEXT: por %xmm3, %xmm2
8282 ; SSE-NEXT: packuswb {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8283 ; SSE-NEXT: movdqa %xmm10, %xmm3
8284 ; SSE-NEXT: pandn %xmm1, %xmm3
8285 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,3,2,3]
8286 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
8287 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
8288 ; SSE-NEXT: packuswb %xmm1, %xmm1
8289 ; SSE-NEXT: pand %xmm10, %xmm1
8290 ; SSE-NEXT: por %xmm1, %xmm3
8291 ; SSE-NEXT: movdqa %xmm13, %xmm1
8292 ; SSE-NEXT: pandn %xmm3, %xmm1
8293 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8294 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8295 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8296 ; SSE-NEXT: por %xmm2, %xmm4
8297 ; SSE-NEXT: movdqa %xmm4, %xmm2
8298 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8299 ; SSE-NEXT: movdqa %xmm14, %xmm3
8300 ; SSE-NEXT: pandn %xmm2, %xmm3
8301 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
8302 ; SSE-NEXT: pand %xmm14, %xmm4
8303 ; SSE-NEXT: por %xmm3, %xmm4
8304 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8305 ; SSE-NEXT: # xmm2 = mem[1,3,2,3]
8306 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8307 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
8308 ; SSE-NEXT: movdqa %xmm3, %xmm2
8309 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8310 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
8311 ; SSE-NEXT: pand %xmm0, %xmm3
8312 ; SSE-NEXT: pandn %xmm2, %xmm0
8313 ; SSE-NEXT: por %xmm3, %xmm0
8314 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
8315 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
8316 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
8317 ; SSE-NEXT: packuswb %xmm2, %xmm0
8318 ; SSE-NEXT: movdqa %xmm6, %xmm2
8319 ; SSE-NEXT: pandn %xmm0, %xmm2
8320 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,1,3]
8321 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
8322 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,3]
8323 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7]
8324 ; SSE-NEXT: packuswb %xmm0, %xmm0
8325 ; SSE-NEXT: pand %xmm6, %xmm0
8326 ; SSE-NEXT: por %xmm0, %xmm2
8327 ; SSE-NEXT: pand %xmm13, %xmm2
8328 ; SSE-NEXT: por %xmm1, %xmm2
8329 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8330 ; SSE-NEXT: movdqa %xmm15, %xmm9
8331 ; SSE-NEXT: movdqa %xmm15, %xmm0
8332 ; SSE-NEXT: pandn %xmm7, %xmm0
8333 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8334 ; SSE-NEXT: pand %xmm15, %xmm1
8335 ; SSE-NEXT: por %xmm0, %xmm1
8336 ; SSE-NEXT: movdqa %xmm1, %xmm0
8337 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
8338 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
8339 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
8340 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
8341 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
8342 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
8343 ; SSE-NEXT: psrlq $48, %xmm0
8344 ; SSE-NEXT: packuswb %xmm0, %xmm1
8345 ; SSE-NEXT: movdqa %xmm6, %xmm0
8346 ; SSE-NEXT: pandn %xmm1, %xmm0
8347 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,0,65535,65535]
8348 ; SSE-NEXT: movdqa %xmm12, %xmm1
8349 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8350 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8351 ; SSE-NEXT: pand %xmm12, %xmm2
8352 ; SSE-NEXT: por %xmm1, %xmm2
8353 ; SSE-NEXT: movdqa %xmm2, %xmm1
8354 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
8355 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,0,65535,65535,65535,65535]
8356 ; SSE-NEXT: movdqa %xmm5, %xmm3
8357 ; SSE-NEXT: pandn %xmm1, %xmm3
8358 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8359 ; SSE-NEXT: pand %xmm5, %xmm2
8360 ; SSE-NEXT: por %xmm3, %xmm2
8361 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7]
8362 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
8363 ; SSE-NEXT: packuswb %xmm1, %xmm1
8364 ; SSE-NEXT: pand %xmm6, %xmm1
8365 ; SSE-NEXT: por %xmm0, %xmm1
8366 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8367 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8368 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8369 ; SSE-NEXT: por %xmm0, %xmm3
8370 ; SSE-NEXT: movdqa %xmm3, %xmm0
8371 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
8372 ; SSE-NEXT: movdqa %xmm14, %xmm2
8373 ; SSE-NEXT: pandn %xmm0, %xmm2
8374 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
8375 ; SSE-NEXT: pand %xmm14, %xmm3
8376 ; SSE-NEXT: por %xmm2, %xmm3
8377 ; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8378 ; SSE-NEXT: # xmm0 = mem[2,1,2,3,4,5,6,7]
8379 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8380 ; SSE-NEXT: packuswb %xmm0, %xmm0
8381 ; SSE-NEXT: movdqa %xmm10, %xmm7
8382 ; SSE-NEXT: movdqa %xmm10, %xmm2
8383 ; SSE-NEXT: pandn %xmm0, %xmm2
8384 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,1,0,4,5,6,7]
8385 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
8386 ; SSE-NEXT: packuswb %xmm0, %xmm0
8387 ; SSE-NEXT: pand %xmm10, %xmm0
8388 ; SSE-NEXT: por %xmm0, %xmm2
8389 ; SSE-NEXT: movdqa %xmm13, %xmm10
8390 ; SSE-NEXT: movdqa %xmm13, %xmm0
8391 ; SSE-NEXT: pandn %xmm2, %xmm0
8392 ; SSE-NEXT: pand %xmm13, %xmm1
8393 ; SSE-NEXT: por %xmm1, %xmm0
8394 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8395 ; SSE-NEXT: movdqa %xmm15, %xmm0
8396 ; SSE-NEXT: pandn %xmm11, %xmm0
8397 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8398 ; SSE-NEXT: pand %xmm15, %xmm1
8399 ; SSE-NEXT: por %xmm0, %xmm1
8400 ; SSE-NEXT: movdqa %xmm1, %xmm0
8401 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
8402 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
8403 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
8404 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
8405 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
8406 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
8407 ; SSE-NEXT: psrlq $48, %xmm0
8408 ; SSE-NEXT: packuswb %xmm0, %xmm1
8409 ; SSE-NEXT: movdqa %xmm6, %xmm0
8410 ; SSE-NEXT: pandn %xmm1, %xmm0
8411 ; SSE-NEXT: movdqa %xmm12, %xmm1
8412 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8413 ; SSE-NEXT: pandn %xmm15, %xmm1
8414 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8415 ; SSE-NEXT: movdqa %xmm4, %xmm2
8416 ; SSE-NEXT: pand %xmm12, %xmm2
8417 ; SSE-NEXT: por %xmm1, %xmm2
8418 ; SSE-NEXT: movdqa %xmm2, %xmm1
8419 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
8420 ; SSE-NEXT: movdqa %xmm5, %xmm3
8421 ; SSE-NEXT: pandn %xmm1, %xmm3
8422 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8423 ; SSE-NEXT: pand %xmm5, %xmm2
8424 ; SSE-NEXT: por %xmm3, %xmm2
8425 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7]
8426 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
8427 ; SSE-NEXT: packuswb %xmm1, %xmm1
8428 ; SSE-NEXT: pand %xmm6, %xmm1
8429 ; SSE-NEXT: por %xmm0, %xmm1
8430 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8431 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8432 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8433 ; SSE-NEXT: por %xmm0, %xmm3
8434 ; SSE-NEXT: movdqa %xmm3, %xmm0
8435 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
8436 ; SSE-NEXT: movdqa %xmm14, %xmm2
8437 ; SSE-NEXT: pandn %xmm0, %xmm2
8438 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
8439 ; SSE-NEXT: pand %xmm14, %xmm3
8440 ; SSE-NEXT: por %xmm2, %xmm3
8441 ; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8442 ; SSE-NEXT: # xmm0 = mem[2,1,2,3,4,5,6,7]
8443 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8444 ; SSE-NEXT: packuswb %xmm0, %xmm0
8445 ; SSE-NEXT: movdqa %xmm7, %xmm2
8446 ; SSE-NEXT: pandn %xmm0, %xmm2
8447 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,1,0,4,5,6,7]
8448 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
8449 ; SSE-NEXT: packuswb %xmm0, %xmm0
8450 ; SSE-NEXT: pand %xmm7, %xmm0
8451 ; SSE-NEXT: por %xmm0, %xmm2
8452 ; SSE-NEXT: movdqa %xmm13, %xmm0
8453 ; SSE-NEXT: pandn %xmm2, %xmm0
8454 ; SSE-NEXT: pand %xmm13, %xmm1
8455 ; SSE-NEXT: por %xmm1, %xmm0
8456 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8457 ; SSE-NEXT: movdqa %xmm9, %xmm0
8458 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8459 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8460 ; SSE-NEXT: pand %xmm9, %xmm1
8461 ; SSE-NEXT: movdqa %xmm9, %xmm13
8462 ; SSE-NEXT: por %xmm0, %xmm1
8463 ; SSE-NEXT: movdqa %xmm1, %xmm0
8464 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
8465 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
8466 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
8467 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
8468 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
8469 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
8470 ; SSE-NEXT: psrlq $48, %xmm0
8471 ; SSE-NEXT: packuswb %xmm0, %xmm1
8472 ; SSE-NEXT: movdqa %xmm6, %xmm0
8473 ; SSE-NEXT: pandn %xmm1, %xmm0
8474 ; SSE-NEXT: movdqa %xmm12, %xmm1
8475 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8476 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8477 ; SSE-NEXT: movdqa %xmm9, %xmm2
8478 ; SSE-NEXT: pand %xmm12, %xmm2
8479 ; SSE-NEXT: por %xmm1, %xmm2
8480 ; SSE-NEXT: movdqa %xmm2, %xmm1
8481 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
8482 ; SSE-NEXT: movdqa %xmm5, %xmm3
8483 ; SSE-NEXT: pandn %xmm1, %xmm3
8484 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8485 ; SSE-NEXT: pand %xmm5, %xmm2
8486 ; SSE-NEXT: por %xmm3, %xmm2
8487 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,2,1,0,4,5,6,7]
8488 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
8489 ; SSE-NEXT: packuswb %xmm1, %xmm1
8490 ; SSE-NEXT: pand %xmm6, %xmm1
8491 ; SSE-NEXT: por %xmm0, %xmm1
8492 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8493 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8494 ; SSE-NEXT: pandn %xmm11, %xmm0
8495 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8496 ; SSE-NEXT: por %xmm0, %xmm3
8497 ; SSE-NEXT: movdqa %xmm3, %xmm0
8498 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
8499 ; SSE-NEXT: movdqa %xmm14, %xmm2
8500 ; SSE-NEXT: pandn %xmm0, %xmm2
8501 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
8502 ; SSE-NEXT: pand %xmm14, %xmm3
8503 ; SSE-NEXT: por %xmm2, %xmm3
8504 ; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8505 ; SSE-NEXT: # xmm0 = mem[2,1,2,3,4,5,6,7]
8506 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8507 ; SSE-NEXT: packuswb %xmm0, %xmm0
8508 ; SSE-NEXT: movdqa %xmm7, %xmm2
8509 ; SSE-NEXT: pandn %xmm0, %xmm2
8510 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,1,0,4,5,6,7]
8511 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
8512 ; SSE-NEXT: packuswb %xmm0, %xmm0
8513 ; SSE-NEXT: pand %xmm7, %xmm0
8514 ; SSE-NEXT: por %xmm0, %xmm2
8515 ; SSE-NEXT: movdqa %xmm10, %xmm0
8516 ; SSE-NEXT: pandn %xmm2, %xmm0
8517 ; SSE-NEXT: pand %xmm10, %xmm1
8518 ; SSE-NEXT: por %xmm1, %xmm0
8519 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8520 ; SSE-NEXT: movdqa %xmm13, %xmm0
8521 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8522 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8523 ; SSE-NEXT: pand %xmm13, %xmm2
8524 ; SSE-NEXT: por %xmm0, %xmm2
8525 ; SSE-NEXT: movdqa %xmm2, %xmm0
8526 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
8527 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
8528 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
8529 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7]
8530 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
8531 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
8532 ; SSE-NEXT: psrlq $48, %xmm0
8533 ; SSE-NEXT: packuswb %xmm0, %xmm1
8534 ; SSE-NEXT: movdqa %xmm12, %xmm0
8535 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
8536 ; SSE-NEXT: pandn %xmm13, %xmm0
8537 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8538 ; SSE-NEXT: pand %xmm12, %xmm2
8539 ; SSE-NEXT: por %xmm0, %xmm2
8540 ; SSE-NEXT: movdqa %xmm2, %xmm0
8541 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
8542 ; SSE-NEXT: movdqa %xmm5, %xmm3
8543 ; SSE-NEXT: pandn %xmm0, %xmm3
8544 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
8545 ; SSE-NEXT: pand %xmm5, %xmm2
8546 ; SSE-NEXT: por %xmm3, %xmm2
8547 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,2,1,0,4,5,6,7]
8548 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
8549 ; SSE-NEXT: packuswb %xmm0, %xmm0
8550 ; SSE-NEXT: pand %xmm6, %xmm0
8551 ; SSE-NEXT: pandn %xmm1, %xmm6
8552 ; SSE-NEXT: por %xmm6, %xmm0
8553 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8554 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8555 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8556 ; SSE-NEXT: por %xmm1, %xmm3
8557 ; SSE-NEXT: movdqa %xmm3, %xmm1
8558 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
8559 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
8560 ; SSE-NEXT: pand %xmm14, %xmm3
8561 ; SSE-NEXT: pandn %xmm1, %xmm14
8562 ; SSE-NEXT: por %xmm3, %xmm14
8563 ; SSE-NEXT: pshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8564 ; SSE-NEXT: # xmm1 = mem[2,1,2,3,4,5,6,7]
8565 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
8566 ; SSE-NEXT: packuswb %xmm1, %xmm1
8567 ; SSE-NEXT: movdqa %xmm7, %xmm2
8568 ; SSE-NEXT: pandn %xmm1, %xmm2
8569 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[0,2,1,0,4,5,6,7]
8570 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7]
8571 ; SSE-NEXT: packuswb %xmm1, %xmm1
8572 ; SSE-NEXT: pand %xmm7, %xmm1
8573 ; SSE-NEXT: por %xmm1, %xmm2
8574 ; SSE-NEXT: movdqa %xmm10, %xmm1
8575 ; SSE-NEXT: pandn %xmm2, %xmm1
8576 ; SSE-NEXT: pand %xmm10, %xmm0
8577 ; SSE-NEXT: por %xmm0, %xmm1
8578 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8579 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,0,65535,65535]
8580 ; SSE-NEXT: movdqa %xmm8, %xmm0
8581 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8582 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8583 ; SSE-NEXT: pand %xmm8, %xmm1
8584 ; SSE-NEXT: por %xmm0, %xmm1
8585 ; SSE-NEXT: movdqa %xmm1, %xmm0
8586 ; SSE-NEXT: pxor %xmm2, %xmm2
8587 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
8588 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
8589 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
8590 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
8591 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
8592 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8593 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8594 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
8595 ; SSE-NEXT: pxor %xmm6, %xmm6
8596 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8597 ; SSE-NEXT: pandn %xmm0, %xmm3
8598 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8599 ; SSE-NEXT: por %xmm3, %xmm2
8600 ; SSE-NEXT: packuswb %xmm0, %xmm2
8601 ; SSE-NEXT: packuswb %xmm1, %xmm1
8602 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3]
8603 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
8604 ; SSE-NEXT: movdqa %xmm12, %xmm1
8605 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8606 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
8607 ; SSE-NEXT: movdqa %xmm14, %xmm2
8608 ; SSE-NEXT: pand %xmm12, %xmm2
8609 ; SSE-NEXT: por %xmm1, %xmm2
8610 ; SSE-NEXT: movdqa %xmm2, %xmm1
8611 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
8612 ; SSE-NEXT: movdqa %xmm5, %xmm3
8613 ; SSE-NEXT: pandn %xmm1, %xmm3
8614 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
8615 ; SSE-NEXT: pand %xmm5, %xmm2
8616 ; SSE-NEXT: por %xmm3, %xmm2
8617 ; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8618 ; SSE-NEXT: # xmm1 = mem[0,1,2,1]
8619 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
8620 ; SSE-NEXT: packuswb %xmm1, %xmm1
8621 ; SSE-NEXT: movdqa %xmm7, %xmm3
8622 ; SSE-NEXT: pandn %xmm1, %xmm3
8623 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
8624 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
8625 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
8626 ; SSE-NEXT: packuswb %xmm1, %xmm1
8627 ; SSE-NEXT: pand %xmm7, %xmm1
8628 ; SSE-NEXT: por %xmm1, %xmm3
8629 ; SSE-NEXT: movdqa %xmm10, %xmm1
8630 ; SSE-NEXT: pandn %xmm3, %xmm1
8631 ; SSE-NEXT: andps %xmm10, %xmm0
8632 ; SSE-NEXT: por %xmm0, %xmm1
8633 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8634 ; SSE-NEXT: movdqa %xmm8, %xmm0
8635 ; SSE-NEXT: pandn %xmm15, %xmm0
8636 ; SSE-NEXT: pand %xmm8, %xmm4
8637 ; SSE-NEXT: por %xmm0, %xmm4
8638 ; SSE-NEXT: movdqa %xmm4, %xmm0
8639 ; SSE-NEXT: pxor %xmm1, %xmm1
8640 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
8641 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
8642 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
8643 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,1,2,3]
8644 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
8645 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
8646 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8647 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
8648 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8649 ; SSE-NEXT: pandn %xmm0, %xmm4
8650 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8651 ; SSE-NEXT: por %xmm4, %xmm3
8652 ; SSE-NEXT: packuswb %xmm0, %xmm3
8653 ; SSE-NEXT: packuswb %xmm2, %xmm2
8654 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,3,3]
8655 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
8656 ; SSE-NEXT: movdqa %xmm12, %xmm2
8657 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8658 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8659 ; SSE-NEXT: movdqa %xmm15, %xmm3
8660 ; SSE-NEXT: pand %xmm12, %xmm3
8661 ; SSE-NEXT: por %xmm2, %xmm3
8662 ; SSE-NEXT: movdqa %xmm3, %xmm2
8663 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
8664 ; SSE-NEXT: movdqa %xmm5, %xmm4
8665 ; SSE-NEXT: pandn %xmm2, %xmm4
8666 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
8667 ; SSE-NEXT: pand %xmm5, %xmm3
8668 ; SSE-NEXT: por %xmm4, %xmm3
8669 ; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8670 ; SSE-NEXT: # xmm2 = mem[0,1,2,1]
8671 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
8672 ; SSE-NEXT: packuswb %xmm2, %xmm2
8673 ; SSE-NEXT: movdqa %xmm7, %xmm4
8674 ; SSE-NEXT: pandn %xmm2, %xmm4
8675 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3]
8676 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
8677 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
8678 ; SSE-NEXT: packuswb %xmm2, %xmm2
8679 ; SSE-NEXT: pand %xmm7, %xmm2
8680 ; SSE-NEXT: por %xmm2, %xmm4
8681 ; SSE-NEXT: movdqa %xmm10, %xmm1
8682 ; SSE-NEXT: pandn %xmm4, %xmm1
8683 ; SSE-NEXT: andps %xmm10, %xmm0
8684 ; SSE-NEXT: por %xmm0, %xmm1
8685 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8686 ; SSE-NEXT: movdqa %xmm8, %xmm0
8687 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8688 ; SSE-NEXT: pand %xmm8, %xmm9
8689 ; SSE-NEXT: por %xmm0, %xmm9
8690 ; SSE-NEXT: movdqa %xmm9, %xmm0
8691 ; SSE-NEXT: pxor %xmm1, %xmm1
8692 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
8693 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
8694 ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
8695 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,1,2,3]
8696 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
8697 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
8698 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8699 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
8700 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8701 ; SSE-NEXT: pandn %xmm0, %xmm4
8702 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8703 ; SSE-NEXT: por %xmm4, %xmm2
8704 ; SSE-NEXT: packuswb %xmm0, %xmm2
8705 ; SSE-NEXT: packuswb %xmm3, %xmm3
8706 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,3,3]
8707 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3]
8708 ; SSE-NEXT: movdqa %xmm12, %xmm3
8709 ; SSE-NEXT: pandn %xmm11, %xmm3
8710 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8711 ; SSE-NEXT: movdqa %xmm2, %xmm4
8712 ; SSE-NEXT: pand %xmm12, %xmm4
8713 ; SSE-NEXT: por %xmm3, %xmm4
8714 ; SSE-NEXT: movdqa %xmm4, %xmm3
8715 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
8716 ; SSE-NEXT: movdqa %xmm5, %xmm6
8717 ; SSE-NEXT: pandn %xmm3, %xmm6
8718 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
8719 ; SSE-NEXT: pand %xmm5, %xmm4
8720 ; SSE-NEXT: por %xmm6, %xmm4
8721 ; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8722 ; SSE-NEXT: # xmm3 = mem[0,1,2,1]
8723 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
8724 ; SSE-NEXT: packuswb %xmm3, %xmm3
8725 ; SSE-NEXT: movdqa %xmm7, %xmm6
8726 ; SSE-NEXT: pandn %xmm3, %xmm6
8727 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,1,0,3]
8728 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7]
8729 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
8730 ; SSE-NEXT: packuswb %xmm3, %xmm3
8731 ; SSE-NEXT: pand %xmm7, %xmm3
8732 ; SSE-NEXT: por %xmm3, %xmm6
8733 ; SSE-NEXT: movdqa %xmm10, %xmm1
8734 ; SSE-NEXT: pandn %xmm6, %xmm1
8735 ; SSE-NEXT: andps %xmm10, %xmm0
8736 ; SSE-NEXT: por %xmm0, %xmm1
8737 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8738 ; SSE-NEXT: movdqa %xmm8, %xmm0
8739 ; SSE-NEXT: pandn %xmm13, %xmm0
8740 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8741 ; SSE-NEXT: pand %xmm8, %xmm4
8742 ; SSE-NEXT: por %xmm0, %xmm4
8743 ; SSE-NEXT: movdqa %xmm4, %xmm0
8744 ; SSE-NEXT: pxor %xmm1, %xmm1
8745 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
8746 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
8747 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
8748 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
8749 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
8750 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
8751 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8752 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
8753 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8754 ; SSE-NEXT: pandn %xmm0, %xmm6
8755 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8756 ; SSE-NEXT: por %xmm6, %xmm3
8757 ; SSE-NEXT: packuswb %xmm0, %xmm3
8758 ; SSE-NEXT: packuswb %xmm4, %xmm4
8759 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,3,3]
8760 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3]
8761 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8762 ; SSE-NEXT: movdqa %xmm12, %xmm3
8763 ; SSE-NEXT: pand %xmm12, %xmm4
8764 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8765 ; SSE-NEXT: por %xmm4, %xmm3
8766 ; SSE-NEXT: movdqa %xmm3, %xmm4
8767 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
8768 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
8769 ; SSE-NEXT: pxor %xmm12, %xmm12
8770 ; SSE-NEXT: pand %xmm5, %xmm3
8771 ; SSE-NEXT: pandn %xmm4, %xmm5
8772 ; SSE-NEXT: por %xmm3, %xmm5
8773 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,1,0,3]
8774 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
8775 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
8776 ; SSE-NEXT: packuswb %xmm4, %xmm4
8777 ; SSE-NEXT: pand %xmm7, %xmm4
8778 ; SSE-NEXT: pshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
8779 ; SSE-NEXT: # xmm5 = mem[0,1,2,1]
8780 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
8781 ; SSE-NEXT: packuswb %xmm5, %xmm5
8782 ; SSE-NEXT: pandn %xmm5, %xmm7
8783 ; SSE-NEXT: por %xmm4, %xmm7
8784 ; SSE-NEXT: movdqa %xmm10, %xmm3
8785 ; SSE-NEXT: pandn %xmm7, %xmm3
8786 ; SSE-NEXT: andps %xmm10, %xmm0
8787 ; SSE-NEXT: por %xmm0, %xmm3
8788 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8789 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8790 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,65535,65535,0,65535]
8791 ; SSE-NEXT: pand %xmm13, %xmm4
8792 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
8793 ; SSE-NEXT: movdqa %xmm4, %xmm6
8794 ; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm12[8],xmm6[9],xmm12[9],xmm6[10],xmm12[10],xmm6[11],xmm12[11],xmm6[12],xmm12[12],xmm6[13],xmm12[13],xmm6[14],xmm12[14],xmm6[15],xmm12[15]
8795 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,65535,0]
8796 ; SSE-NEXT: movdqa %xmm0, %xmm7
8797 ; SSE-NEXT: pandn %xmm6, %xmm7
8798 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
8799 ; SSE-NEXT: pand %xmm0, %xmm4
8800 ; SSE-NEXT: por %xmm7, %xmm4
8801 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
8802 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,5,4,7,6]
8803 ; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8804 ; SSE-NEXT: packuswb %xmm6, %xmm7
8805 ; SSE-NEXT: movdqa %xmm13, %xmm3
8806 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8807 ; SSE-NEXT: pandn %xmm1, %xmm3
8808 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8809 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,3,2,3]
8810 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
8811 ; SSE-NEXT: # xmm6 = mem[0,2,2,3]
8812 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
8813 ; SSE-NEXT: movdqa %xmm6, %xmm4
8814 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm12[8],xmm4[9],xmm12[9],xmm4[10],xmm12[10],xmm4[11],xmm12[11],xmm4[12],xmm12[12],xmm4[13],xmm12[13],xmm4[14],xmm12[14],xmm4[15],xmm12[15]
8815 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
8816 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
8817 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
8818 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
8819 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
8820 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
8821 ; SSE-NEXT: packuswb %xmm6, %xmm6
8822 ; SSE-NEXT: movss {{.*#+}} xmm7 = xmm6[0],xmm7[1,2,3]
8823 ; SSE-NEXT: movdqa %xmm8, %xmm1
8824 ; SSE-NEXT: movdqa %xmm8, %xmm4
8825 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
8826 ; SSE-NEXT: movdqa %xmm14, %xmm8
8827 ; SSE-NEXT: pand %xmm1, %xmm8
8828 ; SSE-NEXT: movdqa %xmm1, %xmm14
8829 ; SSE-NEXT: por %xmm4, %xmm8
8830 ; SSE-NEXT: movdqa %xmm8, %xmm4
8831 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
8832 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,0,65535,65535,65535]
8833 ; SSE-NEXT: movdqa %xmm1, %xmm6
8834 ; SSE-NEXT: pandn %xmm4, %xmm6
8835 ; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15]
8836 ; SSE-NEXT: pand %xmm1, %xmm8
8837 ; SSE-NEXT: por %xmm6, %xmm8
8838 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8839 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8840 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,0,3]
8841 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,6]
8842 ; SSE-NEXT: packuswb %xmm4, %xmm4
8843 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
8844 ; SSE-NEXT: movdqa %xmm6, %xmm9
8845 ; SSE-NEXT: pandn %xmm4, %xmm9
8846 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,1,0,3]
8847 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,3,2,4,5,6,7]
8848 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
8849 ; SSE-NEXT: packuswb %xmm4, %xmm4
8850 ; SSE-NEXT: pand %xmm6, %xmm4
8851 ; SSE-NEXT: por %xmm4, %xmm9
8852 ; SSE-NEXT: movdqa %xmm10, %xmm3
8853 ; SSE-NEXT: pandn %xmm9, %xmm3
8854 ; SSE-NEXT: andps %xmm10, %xmm7
8855 ; SSE-NEXT: movdqa %xmm10, %xmm5
8856 ; SSE-NEXT: por %xmm7, %xmm3
8857 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8858 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8859 ; SSE-NEXT: movdqa %xmm13, %xmm10
8860 ; SSE-NEXT: pand %xmm13, %xmm7
8861 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8862 ; SSE-NEXT: movdqa %xmm7, %xmm8
8863 ; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15]
8864 ; SSE-NEXT: movdqa %xmm0, %xmm9
8865 ; SSE-NEXT: pandn %xmm8, %xmm9
8866 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7]
8867 ; SSE-NEXT: pand %xmm0, %xmm7
8868 ; SSE-NEXT: por %xmm9, %xmm7
8869 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
8870 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm7[0,1,2,3,5,4,7,6]
8871 ; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8872 ; SSE-NEXT: packuswb %xmm8, %xmm9
8873 ; SSE-NEXT: movdqa %xmm13, %xmm4
8874 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8875 ; SSE-NEXT: pandn %xmm3, %xmm4
8876 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8877 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,3,2,3]
8878 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8879 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,2,3]
8880 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
8881 ; SSE-NEXT: movdqa %xmm8, %xmm7
8882 ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15]
8883 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
8884 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7]
8885 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
8886 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
8887 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
8888 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
8889 ; SSE-NEXT: packuswb %xmm8, %xmm8
8890 ; SSE-NEXT: movss {{.*#+}} xmm9 = xmm8[0],xmm9[1,2,3]
8891 ; SSE-NEXT: movdqa %xmm14, %xmm7
8892 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8893 ; SSE-NEXT: movdqa %xmm15, %xmm8
8894 ; SSE-NEXT: pand %xmm14, %xmm8
8895 ; SSE-NEXT: por %xmm7, %xmm8
8896 ; SSE-NEXT: movdqa %xmm8, %xmm7
8897 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7]
8898 ; SSE-NEXT: movdqa %xmm1, %xmm13
8899 ; SSE-NEXT: pandn %xmm7, %xmm13
8900 ; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15]
8901 ; SSE-NEXT: pand %xmm1, %xmm8
8902 ; SSE-NEXT: por %xmm13, %xmm8
8903 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8904 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8905 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,1,0,3]
8906 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6]
8907 ; SSE-NEXT: packuswb %xmm7, %xmm7
8908 ; SSE-NEXT: movdqa %xmm6, %xmm13
8909 ; SSE-NEXT: pandn %xmm7, %xmm13
8910 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,1,0,3]
8911 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,0,3,2,4,5,6,7]
8912 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
8913 ; SSE-NEXT: packuswb %xmm7, %xmm7
8914 ; SSE-NEXT: pand %xmm6, %xmm7
8915 ; SSE-NEXT: por %xmm7, %xmm13
8916 ; SSE-NEXT: movdqa %xmm5, %xmm7
8917 ; SSE-NEXT: pandn %xmm13, %xmm7
8918 ; SSE-NEXT: andps %xmm5, %xmm9
8919 ; SSE-NEXT: por %xmm9, %xmm7
8920 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8921 ; SSE-NEXT: pand %xmm10, %xmm8
8922 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
8923 ; SSE-NEXT: movdqa %xmm8, %xmm9
8924 ; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15]
8925 ; SSE-NEXT: movdqa %xmm0, %xmm13
8926 ; SSE-NEXT: pandn %xmm9, %xmm13
8927 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
8928 ; SSE-NEXT: pand %xmm0, %xmm8
8929 ; SSE-NEXT: por %xmm13, %xmm8
8930 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
8931 ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm8[0,1,2,3,5,4,7,6]
8932 ; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8933 ; SSE-NEXT: packuswb %xmm9, %xmm15
8934 ; SSE-NEXT: movdqa %xmm10, %xmm13
8935 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8936 ; SSE-NEXT: pandn %xmm3, %xmm13
8937 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[1,3,2,3]
8938 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8939 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,2,2,3]
8940 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
8941 ; SSE-NEXT: movdqa %xmm9, %xmm8
8942 ; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15]
8943 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
8944 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
8945 ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
8946 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,2,3]
8947 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
8948 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
8949 ; SSE-NEXT: packuswb %xmm9, %xmm9
8950 ; SSE-NEXT: movss {{.*#+}} xmm15 = xmm9[0],xmm15[1,2,3]
8951 ; SSE-NEXT: movdqa %xmm14, %xmm8
8952 ; SSE-NEXT: pandn %xmm11, %xmm8
8953 ; SSE-NEXT: movdqa %xmm2, %xmm9
8954 ; SSE-NEXT: pand %xmm14, %xmm9
8955 ; SSE-NEXT: por %xmm8, %xmm9
8956 ; SSE-NEXT: movdqa %xmm9, %xmm8
8957 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
8958 ; SSE-NEXT: movdqa %xmm1, %xmm11
8959 ; SSE-NEXT: pandn %xmm8, %xmm11
8960 ; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15]
8961 ; SSE-NEXT: pand %xmm1, %xmm9
8962 ; SSE-NEXT: por %xmm11, %xmm9
8963 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8964 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8965 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,1,0,3]
8966 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6]
8967 ; SSE-NEXT: packuswb %xmm8, %xmm8
8968 ; SSE-NEXT: movdqa %xmm6, %xmm11
8969 ; SSE-NEXT: pandn %xmm8, %xmm11
8970 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm9[2,1,0,3]
8971 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,0,3,2,4,5,6,7]
8972 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
8973 ; SSE-NEXT: packuswb %xmm8, %xmm8
8974 ; SSE-NEXT: pand %xmm6, %xmm8
8975 ; SSE-NEXT: por %xmm8, %xmm11
8976 ; SSE-NEXT: movdqa %xmm5, %xmm9
8977 ; SSE-NEXT: pandn %xmm11, %xmm9
8978 ; SSE-NEXT: andps %xmm5, %xmm15
8979 ; SSE-NEXT: por %xmm15, %xmm9
8980 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8981 ; SSE-NEXT: pand %xmm10, %xmm8
8982 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
8983 ; SSE-NEXT: movdqa %xmm8, %xmm11
8984 ; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15]
8985 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
8986 ; SSE-NEXT: pand %xmm0, %xmm8
8987 ; SSE-NEXT: pandn %xmm11, %xmm0
8988 ; SSE-NEXT: por %xmm8, %xmm0
8989 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
8990 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
8991 ; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8992 ; SSE-NEXT: packuswb %xmm11, %xmm0
8993 ; SSE-NEXT: movdqa %xmm10, %xmm2
8994 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8995 ; SSE-NEXT: pand %xmm10, %xmm15
8996 ; SSE-NEXT: pand %xmm10, %xmm4
8997 ; SSE-NEXT: pand %xmm10, %xmm3
8998 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8999 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9000 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,2,2,3]
9001 ; SSE-NEXT: pand %xmm10, %xmm3
9002 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9003 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9004 ; SSE-NEXT: pandn %xmm3, %xmm2
9005 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[1,3,2,3]
9006 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1]
9007 ; SSE-NEXT: movdqa %xmm8, %xmm11
9008 ; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15]
9009 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
9010 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7]
9011 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
9012 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
9013 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
9014 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3]
9015 ; SSE-NEXT: packuswb %xmm8, %xmm8
9016 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm8[0],xmm0[1,2,3]
9017 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9018 ; SSE-NEXT: movdqa %xmm14, %xmm3
9019 ; SSE-NEXT: pand %xmm14, %xmm8
9020 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9021 ; SSE-NEXT: por %xmm8, %xmm3
9022 ; SSE-NEXT: movdqa %xmm3, %xmm8
9023 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
9024 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15]
9025 ; SSE-NEXT: pand %xmm1, %xmm3
9026 ; SSE-NEXT: pandn %xmm8, %xmm1
9027 ; SSE-NEXT: por %xmm3, %xmm1
9028 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9029 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9030 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm3[0,1,0,3]
9031 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6]
9032 ; SSE-NEXT: packuswb %xmm8, %xmm8
9033 ; SSE-NEXT: movdqa %xmm6, %xmm14
9034 ; SSE-NEXT: pandn %xmm8, %xmm14
9035 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
9036 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7]
9037 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
9038 ; SSE-NEXT: packuswb %xmm1, %xmm1
9039 ; SSE-NEXT: pand %xmm6, %xmm1
9040 ; SSE-NEXT: por %xmm1, %xmm14
9041 ; SSE-NEXT: movdqa %xmm5, %xmm11
9042 ; SSE-NEXT: pandn %xmm14, %xmm11
9043 ; SSE-NEXT: andps %xmm5, %xmm0
9044 ; SSE-NEXT: por %xmm0, %xmm11
9045 ; SSE-NEXT: movdqa %xmm15, %xmm1
9046 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9047 ; SSE-NEXT: movdqa %xmm1, %xmm0
9048 ; SSE-NEXT: pxor %xmm3, %xmm3
9049 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
9050 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
9051 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
9052 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
9053 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7]
9054 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
9055 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9056 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
9057 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9058 ; SSE-NEXT: pandn %xmm1, %xmm10
9059 ; SSE-NEXT: movdqa %xmm1, %xmm8
9060 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9061 ; SSE-NEXT: por %xmm10, %xmm1
9062 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
9063 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
9064 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
9065 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
9066 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[3,3,3,3]
9067 ; SSE-NEXT: packuswb %xmm8, %xmm1
9068 ; SSE-NEXT: packuswb %xmm0, %xmm0
9069 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
9070 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9071 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9072 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
9073 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
9074 ; SSE-NEXT: packuswb %xmm0, %xmm0
9075 ; SSE-NEXT: movdqa %xmm6, %xmm8
9076 ; SSE-NEXT: pandn %xmm0, %xmm8
9077 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9078 ; SSE-NEXT: # xmm0 = mem[1,3,2,3]
9079 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9080 ; SSE-NEXT: # xmm14 = mem[0,2,2,3]
9081 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
9082 ; SSE-NEXT: movdqa %xmm14, %xmm0
9083 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
9084 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,0,65535,65535]
9085 ; SSE-NEXT: movdqa %xmm10, %xmm15
9086 ; SSE-NEXT: pandn %xmm0, %xmm15
9087 ; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm3[8],xmm14[9],xmm3[9],xmm14[10],xmm3[10],xmm14[11],xmm3[11],xmm14[12],xmm3[12],xmm14[13],xmm3[13],xmm14[14],xmm3[14],xmm14[15],xmm3[15]
9088 ; SSE-NEXT: pand %xmm10, %xmm14
9089 ; SSE-NEXT: por %xmm15, %xmm14
9090 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,1,1,1]
9091 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
9092 ; SSE-NEXT: packuswb %xmm0, %xmm14
9093 ; SSE-NEXT: pand %xmm6, %xmm14
9094 ; SSE-NEXT: por %xmm8, %xmm14
9095 ; SSE-NEXT: movdqa %xmm5, %xmm3
9096 ; SSE-NEXT: pandn %xmm14, %xmm3
9097 ; SSE-NEXT: andps %xmm5, %xmm1
9098 ; SSE-NEXT: por %xmm1, %xmm3
9099 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9100 ; SSE-NEXT: movdqa %xmm4, %xmm1
9101 ; SSE-NEXT: pxor %xmm0, %xmm0
9102 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
9103 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
9104 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
9105 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,1,2,1]
9106 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7]
9107 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
9108 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9109 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
9110 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9111 ; SSE-NEXT: pandn %xmm4, %xmm12
9112 ; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload
9113 ; SSE-NEXT: por %xmm12, %xmm8
9114 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,1,3]
9115 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7]
9116 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0]
9117 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5]
9118 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm4[3,3,3,3]
9119 ; SSE-NEXT: packuswb %xmm12, %xmm8
9120 ; SSE-NEXT: packuswb %xmm1, %xmm1
9121 ; SSE-NEXT: movss {{.*#+}} xmm8 = xmm1[0],xmm8[1,2,3]
9122 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9123 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9124 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
9125 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
9126 ; SSE-NEXT: packuswb %xmm1, %xmm1
9127 ; SSE-NEXT: movdqa %xmm6, %xmm12
9128 ; SSE-NEXT: pandn %xmm1, %xmm12
9129 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9130 ; SSE-NEXT: # xmm1 = mem[1,3,2,3]
9131 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9132 ; SSE-NEXT: # xmm14 = mem[0,2,2,3]
9133 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
9134 ; SSE-NEXT: movdqa %xmm14, %xmm1
9135 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
9136 ; SSE-NEXT: movdqa %xmm10, %xmm15
9137 ; SSE-NEXT: pandn %xmm1, %xmm15
9138 ; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15]
9139 ; SSE-NEXT: pand %xmm10, %xmm14
9140 ; SSE-NEXT: por %xmm15, %xmm14
9141 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,1,1,1]
9142 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
9143 ; SSE-NEXT: packuswb %xmm1, %xmm1
9144 ; SSE-NEXT: pand %xmm6, %xmm1
9145 ; SSE-NEXT: por %xmm12, %xmm1
9146 ; SSE-NEXT: movdqa %xmm5, %xmm12
9147 ; SSE-NEXT: pandn %xmm1, %xmm12
9148 ; SSE-NEXT: andps %xmm5, %xmm8
9149 ; SSE-NEXT: movdqa %xmm5, %xmm4
9150 ; SSE-NEXT: por %xmm8, %xmm12
9151 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9152 ; SSE-NEXT: por %xmm13, %xmm0
9153 ; SSE-NEXT: movdqa %xmm0, %xmm1
9154 ; SSE-NEXT: pxor %xmm13, %xmm13
9155 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7]
9156 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
9157 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15]
9158 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,1,2,1]
9159 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7]
9160 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
9161 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9162 ; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15]
9163 ; SSE-NEXT: pxor %xmm0, %xmm0
9164 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9165 ; SSE-NEXT: pandn %xmm5, %xmm13
9166 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9167 ; SSE-NEXT: por %xmm13, %xmm8
9168 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,1,3]
9169 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7]
9170 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0]
9171 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5]
9172 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm5[3,3,3,3]
9173 ; SSE-NEXT: packuswb %xmm13, %xmm8
9174 ; SSE-NEXT: packuswb %xmm1, %xmm1
9175 ; SSE-NEXT: movss {{.*#+}} xmm8 = xmm1[0],xmm8[1,2,3]
9176 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9177 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9178 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
9179 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
9180 ; SSE-NEXT: packuswb %xmm1, %xmm1
9181 ; SSE-NEXT: movdqa %xmm6, %xmm13
9182 ; SSE-NEXT: pandn %xmm1, %xmm13
9183 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9184 ; SSE-NEXT: # xmm1 = mem[1,3,2,3]
9185 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9186 ; SSE-NEXT: # xmm14 = mem[0,2,2,3]
9187 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
9188 ; SSE-NEXT: movdqa %xmm14, %xmm1
9189 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
9190 ; SSE-NEXT: movdqa %xmm10, %xmm15
9191 ; SSE-NEXT: pandn %xmm1, %xmm15
9192 ; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15]
9193 ; SSE-NEXT: pand %xmm10, %xmm14
9194 ; SSE-NEXT: por %xmm15, %xmm14
9195 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,1,1,1]
9196 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
9197 ; SSE-NEXT: packuswb %xmm1, %xmm1
9198 ; SSE-NEXT: pand %xmm6, %xmm1
9199 ; SSE-NEXT: por %xmm13, %xmm1
9200 ; SSE-NEXT: movdqa %xmm4, %xmm0
9201 ; SSE-NEXT: movdqa %xmm4, %xmm13
9202 ; SSE-NEXT: pandn %xmm1, %xmm13
9203 ; SSE-NEXT: andps %xmm4, %xmm8
9204 ; SSE-NEXT: por %xmm8, %xmm13
9205 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9206 ; SSE-NEXT: movdqa %xmm2, %xmm1
9207 ; SSE-NEXT: pxor %xmm14, %xmm14
9208 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
9209 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
9210 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15]
9211 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,1,2,1]
9212 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7]
9213 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
9214 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9215 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15]
9216 ; SSE-NEXT: pxor %xmm15, %xmm15
9217 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9218 ; SSE-NEXT: pandn %xmm2, %xmm5
9219 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9220 ; SSE-NEXT: por %xmm5, %xmm4
9221 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,1,3]
9222 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7]
9223 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0]
9224 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm8[0,1,2,3,4,7,6,5]
9225 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[3,3,3,3]
9226 ; SSE-NEXT: packuswb %xmm8, %xmm14
9227 ; SSE-NEXT: packuswb %xmm1, %xmm1
9228 ; SSE-NEXT: movss {{.*#+}} xmm14 = xmm1[0],xmm14[1,2,3]
9229 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9230 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9231 ; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9232 ; SSE-NEXT: # xmm1 = mem[1,3,2,3]
9233 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
9234 ; SSE-NEXT: # xmm8 = mem[0,2,2,3]
9235 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
9236 ; SSE-NEXT: movdqa %xmm8, %xmm1
9237 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
9238 ; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm15[8],xmm8[9],xmm15[9],xmm8[10],xmm15[10],xmm8[11],xmm15[11],xmm8[12],xmm15[12],xmm8[13],xmm15[13],xmm8[14],xmm15[14],xmm8[15],xmm15[15]
9239 ; SSE-NEXT: pand %xmm10, %xmm8
9240 ; SSE-NEXT: pandn %xmm1, %xmm10
9241 ; SSE-NEXT: por %xmm8, %xmm10
9242 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,1,1,1]
9243 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
9244 ; SSE-NEXT: packuswb %xmm1, %xmm1
9245 ; SSE-NEXT: pand %xmm6, %xmm1
9246 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,1,0,3]
9247 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7]
9248 ; SSE-NEXT: packuswb %xmm8, %xmm8
9249 ; SSE-NEXT: pandn %xmm8, %xmm6
9250 ; SSE-NEXT: por %xmm6, %xmm1
9251 ; SSE-NEXT: andps %xmm0, %xmm14
9252 ; SSE-NEXT: pandn %xmm1, %xmm0
9253 ; SSE-NEXT: por %xmm14, %xmm0
9254 ; SSE-NEXT: movdqa %xmm0, %xmm1
9255 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9256 ; SSE-NEXT: movaps %xmm0, (%rsi)
9257 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9258 ; SSE-NEXT: movaps %xmm0, 48(%rsi)
9259 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9260 ; SSE-NEXT: movaps %xmm0, 32(%rsi)
9261 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9262 ; SSE-NEXT: movaps %xmm0, 16(%rsi)
9263 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9264 ; SSE-NEXT: movaps %xmm0, (%rdx)
9265 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9266 ; SSE-NEXT: movaps %xmm0, 48(%rdx)
9267 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9268 ; SSE-NEXT: movaps %xmm0, 32(%rdx)
9269 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9270 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
9271 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9272 ; SSE-NEXT: movaps %xmm0, (%rcx)
9273 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9274 ; SSE-NEXT: movaps %xmm0, 48(%rcx)
9275 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9276 ; SSE-NEXT: movaps %xmm0, 32(%rcx)
9277 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9278 ; SSE-NEXT: movaps %xmm0, 16(%rcx)
9279 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9280 ; SSE-NEXT: movaps %xmm0, (%r8)
9281 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9282 ; SSE-NEXT: movaps %xmm0, 48(%r8)
9283 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9284 ; SSE-NEXT: movaps %xmm0, 32(%r8)
9285 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9286 ; SSE-NEXT: movaps %xmm0, 16(%r8)
9287 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9288 ; SSE-NEXT: movaps %xmm0, (%r9)
9289 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9290 ; SSE-NEXT: movaps %xmm0, 48(%r9)
9291 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9292 ; SSE-NEXT: movaps %xmm0, 32(%r9)
9293 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9294 ; SSE-NEXT: movaps %xmm0, 16(%r9)
9295 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
9296 ; SSE-NEXT: movdqa %xmm11, (%rax)
9297 ; SSE-NEXT: movdqa %xmm9, 48(%rax)
9298 ; SSE-NEXT: movdqa %xmm7, 32(%rax)
9299 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9300 ; SSE-NEXT: movaps %xmm0, 16(%rax)
9301 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
9302 ; SSE-NEXT: movdqa %xmm1, (%rax)
9303 ; SSE-NEXT: movdqa %xmm13, 48(%rax)
9304 ; SSE-NEXT: movdqa %xmm12, 32(%rax)
9305 ; SSE-NEXT: movdqa %xmm3, 16(%rax)
9306 ; SSE-NEXT: addq $1528, %rsp # imm = 0x5F8
9309 ; AVX-LABEL: load_i8_stride7_vf64:
9311 ; AVX-NEXT: subq $744, %rsp # imm = 0x2E8
9312 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [128,128,6,13,0,0,0,128,128,128,6,13,0,0,0,128]
9313 ; AVX-NEXT: # xmm0 = mem[0,0]
9314 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
9315 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9316 ; AVX-NEXT: vmovdqa 176(%rdi), %xmm2
9317 ; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm1
9318 ; AVX-NEXT: vmovdqa %xmm2, %xmm10
9319 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm8
9320 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9321 ; AVX-NEXT: vmovq {{.*#+}} xmm2 = [128,128,128,5,12,0,0,0,0,0,0,0,0,0,0,0]
9322 ; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm4
9323 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = [0,7,14,128,128,0,0,0,0,0,0,0,0,0,0,0]
9324 ; AVX-NEXT: vmovdqa (%rdi), %xmm5
9325 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9326 ; AVX-NEXT: vpshufb %xmm3, %xmm5, %xmm5
9327 ; AVX-NEXT: vpor %xmm4, %xmm5, %xmm7
9328 ; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,0,0,0,3,10,128,128,128,0,0,0,3,10,128]
9329 ; AVX-NEXT: # xmm4 = mem[0,0]
9330 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm5
9331 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9332 ; AVX-NEXT: vpshufb %xmm4, %xmm5, %xmm6
9333 ; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [8,15,0,0,0,128,128,1,8,15,0,0,0,128,128,1]
9334 ; AVX-NEXT: # xmm5 = mem[0,0]
9335 ; AVX-NEXT: vpshufb %xmm5, %xmm8, %xmm8
9336 ; AVX-NEXT: vpor %xmm6, %xmm8, %xmm8
9337 ; AVX-NEXT: vmovq {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
9338 ; AVX-NEXT: vpblendvb %xmm6, %xmm7, %xmm8, %xmm7
9339 ; AVX-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9340 ; AVX-NEXT: vmovdqa 240(%rdi), %xmm7
9341 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9342 ; AVX-NEXT: vpshufb %xmm2, %xmm7, %xmm2
9343 ; AVX-NEXT: vmovdqa 224(%rdi), %xmm7
9344 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9345 ; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm3
9346 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
9347 ; AVX-NEXT: vmovdqa 256(%rdi), %xmm3
9348 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9349 ; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3
9350 ; AVX-NEXT: vmovdqa 272(%rdi), %xmm4
9351 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9352 ; AVX-NEXT: vpshufb %xmm5, %xmm4, %xmm4
9353 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
9354 ; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [8,15,128,128,0,0,0,1,8,15,128,128,0,0,0,1]
9355 ; AVX-NEXT: # xmm4 = mem[0,0]
9356 ; AVX-NEXT: vpblendvb %xmm6, %xmm2, %xmm3, %xmm2
9357 ; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9358 ; AVX-NEXT: vmovdqa 160(%rdi), %xmm3
9359 ; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm2
9360 ; AVX-NEXT: vmovdqa %xmm3, %xmm7
9361 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm2
9362 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128]
9363 ; AVX-NEXT: vmovdqa 144(%rdi), %xmm5
9364 ; AVX-NEXT: vpshufb %xmm3, %xmm5, %xmm1
9365 ; AVX-NEXT: vmovdqa %xmm5, %xmm12
9366 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm5 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5]
9367 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm6
9368 ; AVX-NEXT: vpshufb %xmm5, %xmm6, %xmm9
9369 ; AVX-NEXT: vmovdqa %xmm6, %xmm14
9370 ; AVX-NEXT: vpor %xmm1, %xmm9, %xmm9
9371 ; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,0,0,0,0,255,255,255,255,255,u,u,u,u]
9372 ; AVX-NEXT: vpblendvb %xmm15, %xmm2, %xmm9, %xmm2
9373 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9374 ; AVX-NEXT: vmovdqa 400(%rdi), %xmm2
9375 ; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm0
9376 ; AVX-NEXT: vmovdqa %xmm2, %xmm9
9377 ; AVX-NEXT: vmovdqa 384(%rdi), %xmm6
9378 ; AVX-NEXT: vpshufb %xmm4, %xmm6, %xmm2
9379 ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
9380 ; AVX-NEXT: vmovdqa 368(%rdi), %xmm8
9381 ; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm2
9382 ; AVX-NEXT: vmovdqa 352(%rdi), %xmm11
9383 ; AVX-NEXT: vpshufb %xmm5, %xmm11, %xmm3
9384 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
9385 ; AVX-NEXT: vpblendvb %xmm15, %xmm0, %xmm2, %xmm0
9386 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9387 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [9,128,128,128,0,0,0,2,9,128,128,128,0,0,0,2]
9388 ; AVX-NEXT: # xmm0 = mem[0,0]
9389 ; AVX-NEXT: vpshufb %xmm0, %xmm7, %xmm3
9390 ; AVX-NEXT: vmovdqa %xmm7, %xmm1
9391 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9392 ; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [128,0,7,14,0,0,0,128,128,0,7,14,0,0,0,128]
9393 ; AVX-NEXT: # xmm2 = mem[0,0]
9394 ; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9395 ; AVX-NEXT: vpshufb %xmm2, %xmm10, %xmm4
9396 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm5
9397 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128]
9398 ; AVX-NEXT: vpshufb %xmm3, %xmm12, %xmm13
9399 ; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9400 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [13,128,128,6,13,128,128,6,13,128,128,6,13,128,128,6]
9401 ; AVX-NEXT: vmovdqa %xmm14, %xmm7
9402 ; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9403 ; AVX-NEXT: vpshufb %xmm4, %xmm14, %xmm14
9404 ; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13
9405 ; AVX-NEXT: vpblendvb %xmm15, %xmm5, %xmm13, %xmm5
9406 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9407 ; AVX-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill
9408 ; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm0
9409 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9410 ; AVX-NEXT: vpshufb %xmm2, %xmm9, %xmm2
9411 ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
9412 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9413 ; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm2
9414 ; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9415 ; AVX-NEXT: vpshufb %xmm4, %xmm11, %xmm3
9416 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
9417 ; AVX-NEXT: vpblendvb %xmm15, %xmm0, %xmm2, %xmm0
9418 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9419 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = [0,0,128,128,128,5,12,0,0,0,0,0,0,0,0,0]
9420 ; AVX-NEXT: vpshufb %xmm0, %xmm12, %xmm2
9421 ; AVX-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,7,14,128,128,0,0,0,0,0,0,0,0,0]
9422 ; AVX-NEXT: vpshufb %xmm12, %xmm7, %xmm3
9423 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm5
9424 ; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [10,128,128,128,0,0,0,3,10,128,128,128,0,0,0,3]
9425 ; AVX-NEXT: # xmm2 = mem[0,0]
9426 ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm4
9427 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [128,1,8,15,0,0,0,128,128,1,8,15,0,0,0,128]
9428 ; AVX-NEXT: # xmm3 = mem[0,0]
9429 ; AVX-NEXT: vpshufb %xmm3, %xmm10, %xmm13
9430 ; AVX-NEXT: vpor %xmm4, %xmm13, %xmm13
9431 ; AVX-NEXT: vmovq {{.*#+}} xmm4 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
9432 ; AVX-NEXT: vpblendvb %xmm4, %xmm5, %xmm13, %xmm5
9433 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9434 ; AVX-NEXT: vpshufb %xmm0, %xmm8, %xmm0
9435 ; AVX-NEXT: vpshufb %xmm12, %xmm11, %xmm1
9436 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
9437 ; AVX-NEXT: vpshufb %xmm2, %xmm6, %xmm1
9438 ; AVX-NEXT: vpshufb %xmm3, %xmm9, %xmm2
9439 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1
9440 ; AVX-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm0
9441 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9442 ; AVX-NEXT: vmovq {{.*#+}} xmm14 = [128,128,128,6,13,0,0,0,0,0,0,0,0,0,0,0]
9443 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9444 ; AVX-NEXT: vpshufb %xmm14, %xmm11, %xmm0
9445 ; AVX-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,128,128,0,0,0,0,0,0,0,0,0,0,0]
9446 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9447 ; AVX-NEXT: vpshufb %xmm2, %xmm10, %xmm3
9448 ; AVX-NEXT: vpor %xmm0, %xmm3, %xmm3
9449 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [9,128,128,2,9,128,128,2,9,128,128,2,9,128,128,2]
9450 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9451 ; AVX-NEXT: vpshufb %xmm4, %xmm15, %xmm0
9452 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm5 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128]
9453 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9454 ; AVX-NEXT: vpshufb %xmm5, %xmm7, %xmm13
9455 ; AVX-NEXT: vpor %xmm0, %xmm13, %xmm13
9456 ; AVX-NEXT: vmovq {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
9457 ; AVX-NEXT: vpblendvb %xmm9, %xmm3, %xmm13, %xmm0
9458 ; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9459 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9460 ; AVX-NEXT: vpshufb %xmm14, %xmm1, %xmm0
9461 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9462 ; AVX-NEXT: vpshufb %xmm2, %xmm13, %xmm2
9463 ; AVX-NEXT: vpor %xmm0, %xmm2, %xmm14
9464 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9465 ; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm2
9466 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9467 ; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm3
9468 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
9469 ; AVX-NEXT: vpblendvb %xmm9, %xmm14, %xmm2, %xmm2
9470 ; AVX-NEXT: vmovdqa %xmm9, %xmm12
9471 ; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9472 ; AVX-NEXT: vmovq {{.*#+}} xmm9 = [2,9,128,128,128,0,0,0,0,0,0,0,0,0,0,0]
9473 ; AVX-NEXT: vpshufb %xmm9, %xmm10, %xmm2
9474 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = [128,128,0,7,14,0,0,0,0,0,0,0,0,0,0,0]
9475 ; AVX-NEXT: vpshufb %xmm3, %xmm11, %xmm4
9476 ; AVX-NEXT: vpor %xmm2, %xmm4, %xmm2
9477 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3]
9478 ; AVX-NEXT: vpshufb %xmm4, %xmm15, %xmm5
9479 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm6 = [128,5,12,128,128,5,12,128,128,5,12,128,128,5,12,128]
9480 ; AVX-NEXT: vpshufb %xmm6, %xmm7, %xmm14
9481 ; AVX-NEXT: vpor %xmm5, %xmm14, %xmm5
9482 ; AVX-NEXT: vmovdqa %xmm12, %xmm14
9483 ; AVX-NEXT: vpblendvb %xmm12, %xmm2, %xmm5, %xmm2
9484 ; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9485 ; AVX-NEXT: vpshufb %xmm9, %xmm13, %xmm5
9486 ; AVX-NEXT: vmovdqa %xmm13, %xmm9
9487 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm2
9488 ; AVX-NEXT: vmovdqa %xmm1, %xmm12
9489 ; AVX-NEXT: vpor %xmm5, %xmm2, %xmm1
9490 ; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm2
9491 ; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm3
9492 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
9493 ; AVX-NEXT: vpblendvb %xmm14, %xmm1, %xmm2, %xmm1
9494 ; AVX-NEXT: vmovdqa %xmm14, %xmm6
9495 ; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9496 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = [3,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0]
9497 ; AVX-NEXT: vpshufb %xmm1, %xmm10, %xmm3
9498 ; AVX-NEXT: vmovq {{.*#+}} xmm2 = [128,128,1,8,15,0,0,0,0,0,0,0,0,0,0,0]
9499 ; AVX-NEXT: vpshufb %xmm2, %xmm11, %xmm4
9500 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm5
9501 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4]
9502 ; AVX-NEXT: vpshufb %xmm3, %xmm15, %xmm13
9503 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,6,13,128,128,6,13,128,128,6,13,128,128,6,13,128]
9504 ; AVX-NEXT: vpshufb %xmm4, %xmm7, %xmm14
9505 ; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13
9506 ; AVX-NEXT: vpblendvb %xmm6, %xmm5, %xmm13, %xmm5
9507 ; AVX-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9508 ; AVX-NEXT: vpshufb %xmm1, %xmm9, %xmm1
9509 ; AVX-NEXT: vpshufb %xmm2, %xmm12, %xmm2
9510 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1
9511 ; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm2
9512 ; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm3
9513 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
9514 ; AVX-NEXT: vpblendvb %xmm6, %xmm1, %xmm2, %xmm0
9515 ; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9516 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = [0,0,128,128,128,6,13,0,0,0,0,0,0,0,0,0]
9517 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9518 ; AVX-NEXT: vpshufb %xmm0, %xmm11, %xmm1
9519 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = [0,0,1,8,15,128,128,0,0,0,0,0,0,0,0,0]
9520 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9521 ; AVX-NEXT: vpshufb %xmm3, %xmm14, %xmm2
9522 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1
9523 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,2,9,128,128,2,9,128,128,2,9,128,128,2,9,128]
9524 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9525 ; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm2
9526 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm5 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4]
9527 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9528 ; AVX-NEXT: vpshufb %xmm5, %xmm12, %xmm13
9529 ; AVX-NEXT: vpor %xmm2, %xmm13, %xmm13
9530 ; AVX-NEXT: vmovq {{.*#+}} xmm2 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
9531 ; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm13, %xmm1
9532 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9533 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9534 ; AVX-NEXT: vpshufb %xmm0, %xmm7, %xmm0
9535 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9536 ; AVX-NEXT: vpshufb %xmm3, %xmm10, %xmm1
9537 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
9538 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9539 ; AVX-NEXT: vpshufb %xmm4, %xmm9, %xmm1
9540 ; AVX-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload
9541 ; AVX-NEXT: vpshufb %xmm5, %xmm6, %xmm3
9542 ; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1
9543 ; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
9544 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9545 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = [0,0,2,9,128,128,128,0,0,0,0,0,0,0,0,0]
9546 ; AVX-NEXT: vpshufb %xmm1, %xmm14, %xmm0
9547 ; AVX-NEXT: vmovdqa %xmm14, %xmm15
9548 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = [0,0,128,128,0,7,14,0,0,0,0,0,0,0,0,0]
9549 ; AVX-NEXT: vpshufb %xmm3, %xmm11, %xmm4
9550 ; AVX-NEXT: vpor %xmm0, %xmm4, %xmm5
9551 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128]
9552 ; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm13
9553 ; AVX-NEXT: vmovdqa %xmm8, %xmm11
9554 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [12,128,128,5,12,128,128,5,12,128,128,5,12,128,128,5]
9555 ; AVX-NEXT: vpshufb %xmm0, %xmm12, %xmm14
9556 ; AVX-NEXT: vpor %xmm13, %xmm14, %xmm13
9557 ; AVX-NEXT: vmovdqa %xmm2, %xmm8
9558 ; AVX-NEXT: vpblendvb %xmm2, %xmm5, %xmm13, %xmm2
9559 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9560 ; AVX-NEXT: vpshufb %xmm1, %xmm10, %xmm1
9561 ; AVX-NEXT: vmovdqa %xmm10, %xmm2
9562 ; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm3
9563 ; AVX-NEXT: vmovdqa %xmm7, %xmm10
9564 ; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1
9565 ; AVX-NEXT: vpshufb %xmm4, %xmm9, %xmm3
9566 ; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm0
9567 ; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0
9568 ; AVX-NEXT: vpblendvb %xmm8, %xmm1, %xmm0, %xmm14
9569 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = [0,0,3,10,128,128,128,0,0,0,0,0,0,0,0,0]
9570 ; AVX-NEXT: vpshufb %xmm1, %xmm15, %xmm0
9571 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = [0,0,128,128,1,8,15,0,0,0,0,0,0,0,0,0]
9572 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9573 ; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm4
9574 ; AVX-NEXT: vpor %xmm0, %xmm4, %xmm4
9575 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm13 = [128,4,11,128,128,4,11,128,128,4,11,128,128,4,11,128]
9576 ; AVX-NEXT: vpshufb %xmm13, %xmm11, %xmm5
9577 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [13,128,128,6,13,128,128,6,13,128,128,6,13,128,128,6]
9578 ; AVX-NEXT: vpshufb %xmm0, %xmm12, %xmm11
9579 ; AVX-NEXT: vpor %xmm5, %xmm11, %xmm5
9580 ; AVX-NEXT: vmovdqa %xmm8, %xmm11
9581 ; AVX-NEXT: vpblendvb %xmm8, %xmm4, %xmm5, %xmm8
9582 ; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1
9583 ; AVX-NEXT: vpshufb %xmm3, %xmm10, %xmm3
9584 ; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1
9585 ; AVX-NEXT: vpshufb %xmm13, %xmm9, %xmm3
9586 ; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm0
9587 ; AVX-NEXT: vmovdqa %xmm6, %xmm9
9588 ; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0
9589 ; AVX-NEXT: vpblendvb %xmm11, %xmm1, %xmm0, %xmm2
9590 ; AVX-NEXT: vmovdqa 208(%rdi), %xmm1
9591 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm0
9592 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12]
9593 ; AVX-NEXT: # xmm3 = mem[0,0]
9594 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm4
9595 ; AVX-NEXT: vmovdqa %xmm1, %xmm6
9596 ; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128]
9597 ; AVX-NEXT: # xmm11 = mem[0,0]
9598 ; AVX-NEXT: vpshufb %xmm11, %xmm0, %xmm13
9599 ; AVX-NEXT: vmovdqa %xmm0, %xmm5
9600 ; AVX-NEXT: vpor %xmm4, %xmm13, %xmm4
9601 ; AVX-NEXT: vpmovsxdq {{.*#+}} xmm13 = [18446744073709486080,16777215]
9602 ; AVX-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload
9603 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9604 ; AVX-NEXT: vmovdqa 432(%rdi), %xmm1
9605 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm0
9606 ; AVX-NEXT: vmovdqa %xmm1, %xmm4
9607 ; AVX-NEXT: vmovdqa 416(%rdi), %xmm3
9608 ; AVX-NEXT: vpshufb %xmm11, %xmm3, %xmm11
9609 ; AVX-NEXT: vpor %xmm0, %xmm11, %xmm0
9610 ; AVX-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9611 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9612 ; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13]
9613 ; AVX-NEXT: # xmm1 = mem[0,0]
9614 ; AVX-NEXT: vpshufb %xmm1, %xmm6, %xmm11
9615 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128]
9616 ; AVX-NEXT: # xmm0 = mem[0,0]
9617 ; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm15
9618 ; AVX-NEXT: vpor %xmm11, %xmm15, %xmm11
9619 ; AVX-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload
9620 ; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9621 ; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm1
9622 ; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
9623 ; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
9624 ; AVX-NEXT: vpblendvb %xmm13, %xmm14, %xmm0, %xmm0
9625 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9626 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128]
9627 ; AVX-NEXT: # xmm0 = mem[0,0]
9628 ; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm1
9629 ; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14]
9630 ; AVX-NEXT: # xmm11 = mem[0,0]
9631 ; AVX-NEXT: vpshufb %xmm11, %xmm6, %xmm14
9632 ; AVX-NEXT: vpor %xmm1, %xmm14, %xmm1
9633 ; AVX-NEXT: vpblendvb %xmm13, %xmm8, %xmm1, %xmm1
9634 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9635 ; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
9636 ; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm1
9637 ; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
9638 ; AVX-NEXT: vpblendvb %xmm13, %xmm2, %xmm0, %xmm0
9639 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9640 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
9641 ; AVX-NEXT: vpshufb %xmm0, %xmm7, %xmm1
9642 ; AVX-NEXT: vmovdqa %xmm0, %xmm14
9643 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = [0,0,4,11,0,0,0,0,0,0,0,0,0,0,0,0]
9644 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9645 ; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm2
9646 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
9647 ; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [128,5,12,0,0,0,128,128,128,5,12,0,0,0,128,128]
9648 ; AVX-NEXT: # xmm1 = mem[0,0]
9649 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9650 ; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm8
9651 ; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [14,128,128,0,0,0,0,7,14,128,128,0,0,0,0,7]
9652 ; AVX-NEXT: # xmm2 = mem[0,0]
9653 ; AVX-NEXT: vpshufb %xmm2, %xmm12, %xmm10
9654 ; AVX-NEXT: vpor %xmm8, %xmm10, %xmm8
9655 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm7[1,2],xmm8[3,4,5,6,7]
9656 ; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128]
9657 ; AVX-NEXT: # xmm7 = mem[0,0]
9658 ; AVX-NEXT: vpshufb %xmm7, %xmm5, %xmm10
9659 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9660 ; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15]
9661 ; AVX-NEXT: # xmm11 = mem[0,0]
9662 ; AVX-NEXT: vpshufb %xmm11, %xmm6, %xmm12
9663 ; AVX-NEXT: vpor %xmm10, %xmm12, %xmm10
9664 ; AVX-NEXT: vpblendvb %xmm13, %xmm8, %xmm10, %xmm8
9665 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9666 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9667 ; AVX-NEXT: vpshufb %xmm0, %xmm8, %xmm0
9668 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9669 ; AVX-NEXT: vpshufb %xmm14, %xmm8, %xmm8
9670 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
9671 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9672 ; AVX-NEXT: vpshufb %xmm1, %xmm8, %xmm1
9673 ; AVX-NEXT: vpshufb %xmm2, %xmm9, %xmm2
9674 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1
9675 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3,4,5,6,7]
9676 ; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm1
9677 ; AVX-NEXT: vmovdqa %xmm4, %xmm8
9678 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9679 ; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm2
9680 ; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1
9681 ; AVX-NEXT: vpblendvb %xmm13, %xmm0, %xmm1, %xmm0
9682 ; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
9683 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,2,9,0,0,2,9,0,0,2,9,0,0,2,9]
9684 ; AVX-NEXT: vmovdqa %xmm6, %xmm13
9685 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9686 ; AVX-NEXT: vpshufb %xmm14, %xmm6, %xmm1
9687 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
9688 ; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm2
9689 ; AVX-NEXT: vmovdqa %xmm0, %xmm5
9690 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
9691 ; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
9692 ; AVX-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7]
9693 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm15 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0]
9694 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm4
9695 ; AVX-NEXT: vpshufb %xmm15, %xmm4, %xmm7
9696 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9697 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13]
9698 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm0
9699 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9700 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm10
9701 ; AVX-NEXT: vmovdqa %xmm2, %xmm6
9702 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1]
9703 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm0
9704 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9705 ; AVX-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9]
9706 ; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,1,2,3,8,15,0,0,0,1,2,3,8,15]
9707 ; AVX-NEXT: # xmm11 = mem[0,0]
9708 ; AVX-NEXT: vpshufb %xmm11, %xmm7, %xmm10
9709 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = [0,7,14,0,0,0,0,0,0,0,0,0,0,0,0,0]
9710 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm2
9711 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9712 ; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm12
9713 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10
9714 ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
9715 ; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm12 # 32-byte Folded Reload
9716 ; AVX-NEXT: vandnps %ymm10, %ymm2, %ymm10
9717 ; AVX-NEXT: vorps %ymm10, %ymm12, %ymm10
9718 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9719 ; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
9720 ; AVX-NEXT: vandnps %ymm1, %ymm12, %ymm1
9721 ; AVX-NEXT: vandps %ymm12, %ymm10, %ymm10
9722 ; AVX-NEXT: vorps %ymm1, %ymm10, %ymm1
9723 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9724 ; AVX-NEXT: vpshufb %xmm14, %xmm8, %xmm1
9725 ; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm10
9726 ; AVX-NEXT: vmovdqa %xmm3, %xmm5
9727 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9728 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3]
9729 ; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
9730 ; AVX-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7]
9731 ; AVX-NEXT: vmovdqa 304(%rdi), %xmm3
9732 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9733 ; AVX-NEXT: vpshufb %xmm15, %xmm3, %xmm14
9734 ; AVX-NEXT: vmovdqa 288(%rdi), %xmm3
9735 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9736 ; AVX-NEXT: vpshufb %xmm6, %xmm3, %xmm15
9737 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
9738 ; AVX-NEXT: vmovdqa 320(%rdi), %xmm9
9739 ; AVX-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7,8,9]
9740 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9741 ; AVX-NEXT: vpshufb %xmm11, %xmm14, %xmm11
9742 ; AVX-NEXT: vmovdqa 336(%rdi), %xmm8
9743 ; AVX-NEXT: vpshufb %xmm0, %xmm8, %xmm0
9744 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9745 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm0
9746 ; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload
9747 ; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0
9748 ; AVX-NEXT: vorps %ymm0, %ymm11, %ymm0
9749 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9750 ; AVX-NEXT: vandnps %ymm1, %ymm12, %ymm1
9751 ; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0
9752 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
9753 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9754 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
9755 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9756 ; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
9757 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3]
9758 ; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
9759 ; AVX-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6,7]
9760 ; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u]
9761 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,0,7,14,0,0,7,14,0,0,7,14,0,0,7,14]
9762 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9763 ; AVX-NEXT: vpshufb %xmm7, %xmm15, %xmm10
9764 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1]
9765 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
9766 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6],xmm0[7]
9767 ; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [0,128,128,128,128,128,3,10,0,128,128,128,128,128,3,10]
9768 ; AVX-NEXT: # xmm2 = mem[0,0]
9769 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9770 ; AVX-NEXT: vpshufb %xmm2, %xmm4, %xmm11
9771 ; AVX-NEXT: vpor %xmm11, %xmm10, %xmm10
9772 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9773 ; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u]
9774 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm10, %ymm10
9775 ; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
9776 ; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
9777 ; AVX-NEXT: vandps %ymm14, %ymm10, %ymm10
9778 ; AVX-NEXT: vorps %ymm0, %ymm10, %ymm0
9779 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9780 ; AVX-NEXT: vandnps %ymm1, %ymm12, %ymm1
9781 ; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0
9782 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
9783 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9784 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9785 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
9786 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
9787 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9788 ; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9789 ; AVX-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7]
9790 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9791 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u]
9792 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9793 ; AVX-NEXT: vpshufb %xmm7, %xmm11, %xmm7
9794 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
9795 ; AVX-NEXT: vpxor %xmm7, %xmm7, %xmm7
9796 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm7[7]
9797 ; AVX-NEXT: vpshufb %xmm2, %xmm9, %xmm2
9798 ; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1
9799 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[1,8,15,u,u,u,u,u,u,u,u,u,u,u,u,u]
9800 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
9801 ; AVX-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload
9802 ; AVX-NEXT: vandps %ymm1, %ymm14, %ymm1
9803 ; AVX-NEXT: vorps %ymm2, %ymm1, %ymm1
9804 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
9805 ; AVX-NEXT: vandnps %ymm0, %ymm12, %ymm0
9806 ; AVX-NEXT: vandps %ymm1, %ymm12, %ymm1
9807 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
9808 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9809 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
9810 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9811 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
9812 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,6,13,0,0,6,13,0,0,6,13,0,0,6,13]
9813 ; AVX-NEXT: vpshufb %xmm1, %xmm6, %xmm1
9814 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9815 ; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9816 ; AVX-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7]
9817 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9818 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u]
9819 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,1,8,15,0,1,8,15,0,1,8,15,0,1,8,15]
9820 ; AVX-NEXT: vpshufb %xmm14, %xmm15, %xmm6
9821 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
9822 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm7[7]
9823 ; AVX-NEXT: vmovddup {{.*#+}} xmm6 = [0,128,128,128,128,128,4,11,0,128,128,128,128,128,4,11]
9824 ; AVX-NEXT: # xmm6 = mem[0,0]
9825 ; AVX-NEXT: vmovdqa %xmm4, %xmm12
9826 ; AVX-NEXT: vpshufb %xmm6, %xmm4, %xmm7
9827 ; AVX-NEXT: vpor %xmm7, %xmm1, %xmm1
9828 ; AVX-NEXT: vmovd {{.*#+}} xmm9 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
9829 ; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm7
9830 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm7
9831 ; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
9832 ; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
9833 ; AVX-NEXT: vandnps %ymm7, %ymm1, %ymm7
9834 ; AVX-NEXT: vorps %ymm7, %ymm8, %ymm7
9835 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm8
9836 ; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
9837 ; AVX-NEXT: vandnps %ymm8, %ymm15, %ymm8
9838 ; AVX-NEXT: vandps %ymm7, %ymm15, %ymm7
9839 ; AVX-NEXT: vorps %ymm7, %ymm8, %ymm0
9840 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9841 ; AVX-NEXT: vpshufb %xmm2, %xmm13, %xmm4
9842 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9843 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
9844 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
9845 ; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
9846 ; AVX-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm3[6,7]
9847 ; AVX-NEXT: vmovdqa %xmm5, %xmm0
9848 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,6,13,u,u]
9849 ; AVX-NEXT: vmovdqa %xmm11, %xmm1
9850 ; AVX-NEXT: vpshufb %xmm14, %xmm11, %xmm4
9851 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
9852 ; AVX-NEXT: vxorps %xmm7, %xmm7, %xmm7
9853 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm7[7]
9854 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9855 ; AVX-NEXT: vpshufb %xmm6, %xmm13, %xmm4
9856 ; AVX-NEXT: vpor %xmm4, %xmm2, %xmm2
9857 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9858 ; AVX-NEXT: vpshufb %xmm9, %xmm14, %xmm4
9859 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
9860 ; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
9861 ; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload
9862 ; AVX-NEXT: vandnps %ymm2, %ymm5, %ymm2
9863 ; AVX-NEXT: vorps %ymm2, %ymm4, %ymm2
9864 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9865 ; AVX-NEXT: vandnps %ymm3, %ymm15, %ymm3
9866 ; AVX-NEXT: vandps %ymm2, %ymm15, %ymm2
9867 ; AVX-NEXT: vorps %ymm3, %ymm2, %ymm2
9868 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9869 ; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [0,2,9,128,128,128,0,0,0,2,9,128,128,128,0,0]
9870 ; AVX-NEXT: # xmm3 = mem[0,0]
9871 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9872 ; AVX-NEXT: vpshufb %xmm3, %xmm9, %xmm2
9873 ; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [0,128,128,0,7,14,0,0,0,128,128,0,7,14,0,0]
9874 ; AVX-NEXT: # xmm4 = mem[0,0]
9875 ; AVX-NEXT: vpshufb %xmm4, %xmm10, %xmm5
9876 ; AVX-NEXT: vpor %xmm2, %xmm5, %xmm2
9877 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm7[7]
9878 ; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [0,128,128,128,128,128,5,12,0,128,128,128,128,128,5,12]
9879 ; AVX-NEXT: # xmm5 = mem[0,0]
9880 ; AVX-NEXT: vpshufb %xmm5, %xmm12, %xmm6
9881 ; AVX-NEXT: vmovdqa %xmm12, %xmm11
9882 ; AVX-NEXT: vpor %xmm6, %xmm2, %xmm6
9883 ; AVX-NEXT: vmovd {{.*#+}} xmm8 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
9884 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9885 ; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm7
9886 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
9887 ; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
9888 ; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload
9889 ; AVX-NEXT: vandnps %ymm6, %ymm12, %ymm6
9890 ; AVX-NEXT: vorps %ymm6, %ymm7, %ymm6
9891 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload
9892 ; AVX-NEXT: vandnps %ymm7, %ymm15, %ymm7
9893 ; AVX-NEXT: vandps %ymm6, %ymm15, %ymm6
9894 ; AVX-NEXT: vorps %ymm7, %ymm6, %ymm6
9895 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9896 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3
9897 ; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm4
9898 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
9899 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],mem[7]
9900 ; AVX-NEXT: vpshufb %xmm5, %xmm13, %xmm4
9901 ; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3
9902 ; AVX-NEXT: vpshufb %xmm8, %xmm14, %xmm4
9903 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
9904 ; AVX-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload
9905 ; AVX-NEXT: vandnps %ymm3, %ymm12, %ymm3
9906 ; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3
9907 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
9908 ; AVX-NEXT: vandnps %ymm4, %ymm15, %ymm4
9909 ; AVX-NEXT: vandps %ymm3, %ymm15, %ymm3
9910 ; AVX-NEXT: vorps %ymm4, %ymm3, %ymm0
9911 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9912 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9913 ; AVX-NEXT: vmovd {{.*#+}} xmm14 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
9914 ; AVX-NEXT: vpshufb %xmm14, %xmm8, %xmm3
9915 ; AVX-NEXT: vmovd {{.*#+}} xmm0 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
9916 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9917 ; AVX-NEXT: vpshufb %xmm0, %xmm13, %xmm4
9918 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
9919 ; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [12,0,0,0,128,128,128,5,12,0,0,0,128,128,128,5]
9920 ; AVX-NEXT: # xmm4 = mem[0,0]
9921 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9922 ; AVX-NEXT: vpshufb %xmm4, %xmm12, %xmm5
9923 ; AVX-NEXT: vmovddup {{.*#+}} xmm6 = [128,0,0,0,0,7,14,128,128,0,0,0,0,7,14,128]
9924 ; AVX-NEXT: # xmm6 = mem[0,0]
9925 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9926 ; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm7
9927 ; AVX-NEXT: vpor %xmm5, %xmm7, %xmm5
9928 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3,4,5,6,7]
9929 ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[u,u]
9930 ; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [0,128,128,1,8,15,0,0,0,128,128,1,8,15,0,0]
9931 ; AVX-NEXT: # xmm5 = mem[0,0]
9932 ; AVX-NEXT: vpshufb %xmm5, %xmm10, %xmm9
9933 ; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7
9934 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6],mem[7]
9935 ; AVX-NEXT: vmovddup {{.*#+}} xmm9 = [0,128,128,128,128,128,6,13,0,128,128,128,128,128,6,13]
9936 ; AVX-NEXT: # xmm9 = mem[0,0]
9937 ; AVX-NEXT: vpshufb %xmm9, %xmm11, %xmm10
9938 ; AVX-NEXT: vpor %xmm7, %xmm10, %xmm7
9939 ; AVX-NEXT: vmovdqa %xmm0, %xmm11
9940 ; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm10
9941 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7
9942 ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
9943 ; AVX-NEXT: vandps %ymm0, %ymm3, %ymm3
9944 ; AVX-NEXT: vandnps %ymm7, %ymm0, %ymm7
9945 ; AVX-NEXT: vorps %ymm7, %ymm3, %ymm3
9946 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 16-byte Folded Reload
9947 ; AVX-NEXT: vandnps %ymm7, %ymm15, %ymm7
9948 ; AVX-NEXT: vandps %ymm3, %ymm15, %ymm3
9949 ; AVX-NEXT: vorps %ymm7, %ymm3, %ymm0
9950 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9951 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9952 ; AVX-NEXT: vpshufb %xmm14, %xmm10, %xmm3
9953 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9954 ; AVX-NEXT: vpshufb %xmm11, %xmm14, %xmm7
9955 ; AVX-NEXT: vmovdqa %xmm11, %xmm2
9956 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
9957 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9958 ; AVX-NEXT: vpshufb %xmm4, %xmm11, %xmm4
9959 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9960 ; AVX-NEXT: vpshufb %xmm6, %xmm7, %xmm6
9961 ; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4
9962 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3,4,5,6,7]
9963 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9964 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm0[u,u]
9965 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9966 ; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm5
9967 ; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4
9968 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],mem[7]
9969 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9970 ; AVX-NEXT: vpshufb %xmm9, %xmm5, %xmm5
9971 ; AVX-NEXT: vpor %xmm5, %xmm4, %xmm4
9972 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9973 ; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5
9974 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
9975 ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
9976 ; AVX-NEXT: vandps %ymm2, %ymm3, %ymm3
9977 ; AVX-NEXT: vandnps %ymm4, %ymm2, %ymm4
9978 ; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3
9979 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
9980 ; AVX-NEXT: vandnps %ymm4, %ymm15, %ymm4
9981 ; AVX-NEXT: vandps %ymm3, %ymm15, %ymm3
9982 ; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3
9983 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9984 ; AVX-NEXT: vmovd {{.*#+}} xmm2 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
9985 ; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm3
9986 ; AVX-NEXT: vmovd {{.*#+}} xmm5 = [5,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
9987 ; AVX-NEXT: vpshufb %xmm5, %xmm13, %xmm4
9988 ; AVX-NEXT: vmovdqa %xmm5, %xmm8
9989 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
9990 ; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [13,0,0,0,128,128,128,6,13,0,0,0,128,128,128,6]
9991 ; AVX-NEXT: # xmm4 = mem[0,0]
9992 ; AVX-NEXT: vpshufb %xmm4, %xmm12, %xmm5
9993 ; AVX-NEXT: vmovddup {{.*#+}} xmm6 = [128,0,0,0,1,8,15,128,128,0,0,0,1,8,15,128]
9994 ; AVX-NEXT: # xmm6 = mem[0,0]
9995 ; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm7
9996 ; AVX-NEXT: vpor %xmm5, %xmm7, %xmm5
9997 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7]
9998 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9999 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u],zero,zero,xmm1[2,9,u,u,u]
10000 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
10001 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm12[u,u,u]
10002 ; AVX-NEXT: vpor %xmm3, %xmm9, %xmm9
10003 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero
10004 ; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [0,128,128,128,128,0,7,14,0,128,128,128,128,0,7,14]
10005 ; AVX-NEXT: # xmm7 = mem[0,0]
10006 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10007 ; AVX-NEXT: vpshufb %xmm7, %xmm1, %xmm13
10008 ; AVX-NEXT: vpor %xmm13, %xmm9, %xmm9
10009 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10010 ; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm13
10011 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9
10012 ; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
10013 ; AVX-NEXT: vandps %ymm1, %ymm5, %ymm5
10014 ; AVX-NEXT: vandnps %ymm9, %ymm1, %ymm9
10015 ; AVX-NEXT: vmovaps %ymm1, %ymm13
10016 ; AVX-NEXT: vorps %ymm5, %ymm9, %ymm5
10017 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload
10018 ; AVX-NEXT: vandnps %ymm9, %ymm15, %ymm9
10019 ; AVX-NEXT: vandps %ymm5, %ymm15, %ymm5
10020 ; AVX-NEXT: vorps %ymm5, %ymm9, %ymm1
10021 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10022 ; AVX-NEXT: vpshufb %xmm2, %xmm10, %xmm2
10023 ; AVX-NEXT: vpshufb %xmm8, %xmm14, %xmm9
10024 ; AVX-NEXT: vmovdqa %xmm8, %xmm14
10025 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
10026 ; AVX-NEXT: vpshufb %xmm4, %xmm11, %xmm4
10027 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10028 ; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm6
10029 ; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4
10030 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3,4,5,6,7]
10031 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9,u,u,u]
10032 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
10033 ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm10[u,u,u]
10034 ; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4
10035 ; AVX-NEXT: vmovddup {{.*#+}} xmm11 = [0,9,10,11,12,128,128,128,0,9,10,11,12,128,128,128]
10036 ; AVX-NEXT: # xmm11 = mem[0,0]
10037 ; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm4
10038 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10039 ; AVX-NEXT: vpshufb %xmm7, %xmm1, %xmm6
10040 ; AVX-NEXT: vpor %xmm6, %xmm4, %xmm4
10041 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10042 ; AVX-NEXT: vpshufb %xmm14, %xmm5, %xmm6
10043 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
10044 ; AVX-NEXT: vandps %ymm2, %ymm13, %ymm2
10045 ; AVX-NEXT: vandnps %ymm4, %ymm13, %ymm4
10046 ; AVX-NEXT: vorps %ymm4, %ymm2, %ymm2
10047 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
10048 ; AVX-NEXT: vandnps %ymm4, %ymm15, %ymm4
10049 ; AVX-NEXT: vandps %ymm2, %ymm15, %ymm2
10050 ; AVX-NEXT: vorps %ymm4, %ymm2, %ymm0
10051 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10052 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10053 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
10054 ; AVX-NEXT: vmovd {{.*#+}} xmm2 = [6,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
10055 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10056 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm6
10057 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
10058 ; AVX-NEXT: vmovddup {{.*#+}} xmm6 = [128,0,0,0,2,9,128,128,128,0,0,0,2,9,128,128]
10059 ; AVX-NEXT: # xmm6 = mem[0,0]
10060 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10061 ; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm7
10062 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [14,0,0,0,128,128,0,7,14,0,0,0,128,128,0,7]
10063 ; AVX-NEXT: # xmm0 = mem[0,0]
10064 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10065 ; AVX-NEXT: vpshufb %xmm0, %xmm9, %xmm9
10066 ; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7
10067 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3,4,5,6,7]
10068 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm7 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3]
10069 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10070 ; AVX-NEXT: vpshufb %xmm7, %xmm9, %xmm9
10071 ; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm12[u,u,u]
10072 ; AVX-NEXT: vpor %xmm9, %xmm13, %xmm9
10073 ; AVX-NEXT: vpshufb %xmm11, %xmm9, %xmm9
10074 ; AVX-NEXT: vmovddup {{.*#+}} xmm13 = [0,128,128,128,128,1,8,15,0,128,128,128,128,1,8,15]
10075 ; AVX-NEXT: # xmm13 = mem[0,0]
10076 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10077 ; AVX-NEXT: vpshufb %xmm13, %xmm11, %xmm14
10078 ; AVX-NEXT: vpor %xmm14, %xmm9, %xmm9
10079 ; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm14
10080 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm9, %ymm9
10081 ; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
10082 ; AVX-NEXT: vandps %ymm4, %ymm11, %ymm4
10083 ; AVX-NEXT: vandnps %ymm9, %ymm11, %ymm9
10084 ; AVX-NEXT: vorps %ymm4, %ymm9, %ymm4
10085 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 16-byte Folded Reload
10086 ; AVX-NEXT: vandnps %ymm9, %ymm15, %ymm9
10087 ; AVX-NEXT: vandps %ymm4, %ymm15, %ymm4
10088 ; AVX-NEXT: vorps %ymm4, %ymm9, %ymm4
10089 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10090 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
10091 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10092 ; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm12
10093 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
10094 ; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm6
10095 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10096 ; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm8
10097 ; AVX-NEXT: vpor %xmm6, %xmm8, %xmm6
10098 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3,4,5,6,7]
10099 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10100 ; AVX-NEXT: vpshufb %xmm7, %xmm0, %xmm7
10101 ; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm10[u,u,u]
10102 ; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7
10103 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,9,10,11,12],zero,zero,zero
10104 ; AVX-NEXT: vpshufb %xmm13, %xmm1, %xmm7
10105 ; AVX-NEXT: vpor %xmm7, %xmm3, %xmm3
10106 ; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm2
10107 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
10108 ; AVX-NEXT: vandps %ymm6, %ymm11, %ymm3
10109 ; AVX-NEXT: vandnps %ymm2, %ymm11, %ymm1
10110 ; AVX-NEXT: vorps %ymm1, %ymm3, %ymm1
10111 ; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm2 # 16-byte Folded Reload
10112 ; AVX-NEXT: vandnps %ymm2, %ymm15, %ymm2
10113 ; AVX-NEXT: vandps %ymm1, %ymm15, %ymm0
10114 ; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0
10115 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10116 ; AVX-NEXT: vmovaps %ymm1, 32(%rsi)
10117 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10118 ; AVX-NEXT: vmovaps %ymm1, (%rsi)
10119 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10120 ; AVX-NEXT: vmovaps %ymm1, 32(%rdx)
10121 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10122 ; AVX-NEXT: vmovaps %ymm1, (%rdx)
10123 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10124 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
10125 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10126 ; AVX-NEXT: vmovaps %ymm1, (%rcx)
10127 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10128 ; AVX-NEXT: vmovaps %ymm1, 32(%r8)
10129 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10130 ; AVX-NEXT: vmovaps %ymm1, (%r8)
10131 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10132 ; AVX-NEXT: vmovaps %ymm1, 32(%r9)
10133 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10134 ; AVX-NEXT: vmovaps %ymm1, (%r9)
10135 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
10136 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10137 ; AVX-NEXT: vmovaps %ymm1, 32(%rax)
10138 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10139 ; AVX-NEXT: vmovaps %ymm1, (%rax)
10140 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
10141 ; AVX-NEXT: vmovaps %ymm0, 32(%rax)
10142 ; AVX-NEXT: vmovaps %ymm4, (%rax)
10143 ; AVX-NEXT: addq $744, %rsp # imm = 0x2E8
10144 ; AVX-NEXT: vzeroupper
10145 ; AVX-NEXT: retq
10146 ;
10147 ; AVX2-LABEL: load_i8_stride7_vf64:
10148 ; AVX2: # %bb.0:
10149 ; AVX2-NEXT: subq $760, %rsp # imm = 0x2F8
10150 ; AVX2-NEXT: vmovdqa 320(%rdi), %ymm6
10151 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm7
10152 ; AVX2-NEXT: vmovdqa 256(%rdi), %ymm8
10153 ; AVX2-NEXT: vmovdqa (%rdi), %ymm1
10154 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
10155 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm4
10156 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm5
10157 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
10158 ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0
10159 ; AVX2-NEXT: vmovdqa %ymm3, %ymm13
10160 ; AVX2-NEXT: vmovdqa %ymm2, %ymm10
10161 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10162 ; AVX2-NEXT: vmovdqa %ymm1, %ymm12
10163 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10164 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
10165 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u]
10166 ; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm3
10167 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u]
10168 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
10169 ; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
10170 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
10171 ; AVX2-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm3
10172 ; AVX2-NEXT: vmovdqa %ymm9, %ymm14
10173 ; AVX2-NEXT: vmovdqa %ymm5, %ymm9
10174 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10175 ; AVX2-NEXT: vmovdqa %ymm4, %ymm11
10176 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10177 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
10178 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
10179 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9]
10180 ; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
10181 ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm5
10182 ; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0]
10183 ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0
10184 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10185 ; AVX2-NEXT: vpblendvb %ymm13, %ymm7, %ymm8, %ymm5
10186 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10187 ; AVX2-NEXT: vmovdqa %ymm7, %ymm0
10188 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10189 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7
10190 ; AVX2-NEXT: vpshufb %xmm1, %xmm7, %xmm1
10191 ; AVX2-NEXT: vmovdqa 288(%rdi), %ymm7
10192 ; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm2
10193 ; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1
10194 ; AVX2-NEXT: vpblendvb %ymm14, %ymm6, %ymm7, %ymm2
10195 ; AVX2-NEXT: vmovdqa %ymm7, %ymm15
10196 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10197 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10198 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5
10199 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15]
10200 ; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
10201 ; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
10202 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10203 ; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm2
10204 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
10205 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15]
10206 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
10207 ; AVX2-NEXT: vpblendvb %ymm2, %ymm12, %ymm10, %ymm5
10208 ; AVX2-NEXT: vmovdqa %ymm2, %ymm1
10209 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm3
10210 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u]
10211 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm7
10212 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u]
10213 ; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5
10214 ; AVX2-NEXT: vpor %xmm7, %xmm5, %xmm5
10215 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10]
10216 ; AVX2-NEXT: # ymm7 = mem[0,1,0,1]
10217 ; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4
10218 ; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm9 = [0,18446744073709551360,16777215,0]
10219 ; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4
10220 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10221 ; AVX2-NEXT: vpblendvb %ymm13, %ymm15, %ymm6, %ymm4
10222 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
10223 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7,8,9],ymm5[10],ymm4[11,12,13],ymm5[14],ymm4[15]
10224 ; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4
10225 ; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm5
10226 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7
10227 ; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2
10228 ; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm3
10229 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
10230 ; AVX2-NEXT: vpblendvb %ymm9, %ymm4, %ymm2, %ymm0
10231 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10232 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm9
10233 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm3
10234 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
10235 ; AVX2-NEXT: vpblendvb %ymm4, %ymm9, %ymm3, %ymm2
10236 ; AVX2-NEXT: vmovdqa %ymm3, %ymm15
10237 ; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10238 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
10239 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,3,10,128,128,128,6,13,u,u,u,u]
10240 ; AVX2-NEXT: vpshufb %xmm8, %xmm3, %xmm3
10241 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,5,12,128,128,1,8,15,128,128,u,u,u,u]
10242 ; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2
10243 ; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
10244 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
10245 ; AVX2-NEXT: vmovdqa 208(%rdi), %xmm5
10246 ; AVX2-NEXT: vpshufb %xmm12, %xmm5, %xmm3
10247 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
10248 ; AVX2-NEXT: vmovdqa 192(%rdi), %xmm1
10249 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm10
10250 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10251 ; AVX2-NEXT: vmovdqa %xmm6, %xmm11
10252 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3]
10253 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
10254 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
10255 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
10256 ; AVX2-NEXT: vpmovsxdq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,16777215,0]
10257 ; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
10258 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10259 ; AVX2-NEXT: vmovdqa 384(%rdi), %ymm2
10260 ; AVX2-NEXT: vmovdqa 352(%rdi), %ymm3
10261 ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0
10262 ; AVX2-NEXT: vmovdqa %ymm2, %ymm4
10263 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
10264 ; AVX2-NEXT: vpshufb %xmm8, %xmm6, %xmm6
10265 ; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0
10266 ; AVX2-NEXT: vpor %xmm6, %xmm0, %xmm0
10267 ; AVX2-NEXT: vmovdqa 432(%rdi), %xmm13
10268 ; AVX2-NEXT: vpshufb %xmm12, %xmm13, %xmm6
10269 ; AVX2-NEXT: vmovdqa 416(%rdi), %xmm2
10270 ; AVX2-NEXT: vpshufb %xmm11, %xmm2, %xmm8
10271 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
10272 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10273 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
10274 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7]
10275 ; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10276 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10277 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
10278 ; AVX2-NEXT: vpblendvb %ymm7, %ymm9, %ymm15, %ymm0
10279 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,6,13,128,128,2,9,128,128,128,u,u,u,u]
10280 ; AVX2-NEXT: vpshufb %xmm10, %xmm0, %xmm6
10281 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
10282 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,4,11,128,128,0,7,14,u,u,u,u]
10283 ; AVX2-NEXT: vpshufb %xmm8, %xmm0, %xmm0
10284 ; AVX2-NEXT: vpor %xmm6, %xmm0, %xmm0
10285 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
10286 ; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm9
10287 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm12 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12]
10288 ; AVX2-NEXT: vpshufb %xmm12, %xmm1, %xmm11
10289 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
10290 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10291 ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
10292 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7]
10293 ; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10294 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10295 ; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm0
10296 ; AVX2-NEXT: vmovdqa %ymm3, %ymm11
10297 ; AVX2-NEXT: vpshufb %xmm10, %xmm0, %xmm1
10298 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
10299 ; AVX2-NEXT: vpshufb %xmm8, %xmm0, %xmm0
10300 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
10301 ; AVX2-NEXT: vpshufb %xmm6, %xmm13, %xmm1
10302 ; AVX2-NEXT: vpshufb %xmm12, %xmm2, %xmm6
10303 ; AVX2-NEXT: vmovdqa %xmm2, %xmm12
10304 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
10305 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10306 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10307 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
10308 ; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10309 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10310 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
10311 ; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm0
10312 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10313 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
10314 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,128,128,128,6,13,128,128,2,9,u,u,u,u,u]
10315 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1
10316 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,1,8,15,128,128,4,11,128,128,u,u,u,u,u]
10317 ; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0
10318 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
10319 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12]
10320 ; AVX2-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10321 ; AVX2-NEXT: vpshufb %xmm1, %xmm13, %xmm8
10322 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128]
10323 ; AVX2-NEXT: vpshufb %xmm9, %xmm12, %xmm10
10324 ; AVX2-NEXT: vmovdqa %xmm12, %xmm3
10325 ; AVX2-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10326 ; AVX2-NEXT: vpor %xmm8, %xmm10, %xmm8
10327 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10
10328 ; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
10329 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
10330 ; AVX2-NEXT: # ymm0 = mem[0,1,0,1]
10331 ; AVX2-NEXT: vpblendvb %ymm0, %ymm10, %ymm8, %ymm8
10332 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10333 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
10334 ; AVX2-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm8
10335 ; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm10
10336 ; AVX2-NEXT: vpshufb %xmm6, %xmm10, %xmm6
10337 ; AVX2-NEXT: vpshufb %xmm7, %xmm8, %xmm7
10338 ; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6
10339 ; AVX2-NEXT: vpshufb %xmm1, %xmm5, %xmm1
10340 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10341 ; AVX2-NEXT: vpshufb %xmm9, %xmm14, %xmm7
10342 ; AVX2-NEXT: vpor %xmm1, %xmm7, %xmm1
10343 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
10344 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10345 ; AVX2-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1
10346 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10347 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
10348 ; AVX2-NEXT: vpblendvb %ymm2, %ymm11, %ymm4, %ymm1
10349 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,2,9,128,128,128,5,12,128,128,u,u,u,u,u]
10350 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7
10351 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
10352 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u]
10353 ; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm1
10354 ; AVX2-NEXT: vpor %xmm7, %xmm1, %xmm1
10355 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13]
10356 ; AVX2-NEXT: vpshufb %xmm7, %xmm13, %xmm9
10357 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128]
10358 ; AVX2-NEXT: vpshufb %xmm10, %xmm3, %xmm11
10359 ; AVX2-NEXT: vpor %xmm9, %xmm11, %xmm9
10360 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10361 ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
10362 ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1
10363 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10364 ; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10365 ; AVX2-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm1
10366 ; AVX2-NEXT: vmovdqa %ymm12, %ymm2
10367 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm6
10368 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
10369 ; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm1
10370 ; AVX2-NEXT: vpor %xmm6, %xmm1, %xmm1
10371 ; AVX2-NEXT: vpshufb %xmm7, %xmm5, %xmm6
10372 ; AVX2-NEXT: vmovdqa %xmm5, %xmm13
10373 ; AVX2-NEXT: vpshufb %xmm10, %xmm14, %xmm7
10374 ; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6
10375 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10376 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
10377 ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1
10378 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10379 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
10380 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10381 ; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm1
10382 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,3,10,128,128,128,6,13,128,128,u,u,u,u,u]
10383 ; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm7
10384 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
10385 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u]
10386 ; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm1
10387 ; AVX2-NEXT: vpor %xmm7, %xmm1, %xmm1
10388 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128]
10389 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
10390 ; AVX2-NEXT: vpshufb %xmm7, %xmm12, %xmm9
10391 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14]
10392 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10393 ; AVX2-NEXT: vpshufb %xmm10, %xmm14, %xmm11
10394 ; AVX2-NEXT: vpor %xmm9, %xmm11, %xmm9
10395 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10396 ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
10397 ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1
10398 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10399 ; AVX2-NEXT: vpblendvb %ymm6, %ymm15, %ymm2, %ymm1
10400 ; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6
10401 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
10402 ; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm1
10403 ; AVX2-NEXT: vpor %xmm6, %xmm1, %xmm1
10404 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10405 ; AVX2-NEXT: vpshufb %xmm7, %xmm11, %xmm6
10406 ; AVX2-NEXT: vpshufb %xmm10, %xmm13, %xmm7
10407 ; AVX2-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill
10408 ; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6
10409 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10410 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
10411 ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1
10412 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10413 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
10414 ; AVX2-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm2
10415 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10416 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
10417 ; AVX2-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm1
10418 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
10419 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,128,128,2,9,128,128,128,5,12,u,u,u,u,u]
10420 ; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
10421 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,4,11,128,128,0,7,14,128,128,u,u,u,u,u]
10422 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1
10423 ; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1
10424 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128]
10425 ; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm7
10426 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15]
10427 ; AVX2-NEXT: vpshufb %xmm8, %xmm14, %xmm9
10428 ; AVX2-NEXT: vpor %xmm7, %xmm9, %xmm7
10429 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10430 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
10431 ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm7, %ymm1
10432 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10433 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10434 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10435 ; AVX2-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm5
10436 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10437 ; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1
10438 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
10439 ; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3
10440 ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1
10441 ; AVX2-NEXT: vpor %xmm3, %xmm1, %xmm1
10442 ; AVX2-NEXT: vpshufb %xmm2, %xmm11, %xmm2
10443 ; AVX2-NEXT: vpshufb %xmm8, %xmm13, %xmm3
10444 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
10445 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10446 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
10447 ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
10448 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10449 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10450 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10451 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
10452 ; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm1
10453 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
10454 ; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm12
10455 ; AVX2-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm13
10456 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
10457 ; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm3
10458 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10459 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
10460 ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
10461 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10462 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10463 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10464 ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm3
10465 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10466 ; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm3
10467 ; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm14
10468 ; AVX2-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm4
10469 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10470 ; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm0
10471 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10472 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10473 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
10474 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
10475 ; AVX2-NEXT: vpblendvb %ymm10, %ymm6, %ymm2, %ymm0
10476 ; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm4
10477 ; AVX2-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm9
10478 ; AVX2-NEXT: vpblendvb %ymm15, %ymm2, %ymm6, %ymm8
10479 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10480 ; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm6, %ymm2
10481 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10482 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10483 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
10484 ; AVX2-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm2
10485 ; AVX2-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm6
10486 ; AVX2-NEXT: vpblendvb %ymm5, %ymm7, %ymm8, %ymm10
10487 ; AVX2-NEXT: vpblendvb %ymm15, %ymm8, %ymm7, %ymm5
10488 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10489 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
10490 ; AVX2-NEXT: vpblendvb %ymm5, %ymm8, %ymm7, %ymm11
10491 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u]
10492 ; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm15
10493 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
10494 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u]
10495 ; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm1
10496 ; AVX2-NEXT: vpor %xmm1, %xmm15, %xmm1
10497 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm15
10498 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8,9,10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15]
10499 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11]
10500 ; AVX2-NEXT: # ymm5 = mem[0,1,0,1]
10501 ; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0
10502 ; AVX2-NEXT: vpmovsxwq {{.*#+}} xmm15 = [18446744073709551615,255]
10503 ; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0
10504 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10505 ; AVX2-NEXT: vpshufb %xmm8, %xmm3, %xmm0
10506 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1
10507 ; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm1
10508 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
10509 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
10510 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
10511 ; AVX2-NEXT: vpshufb %ymm5, %ymm1, %ymm1
10512 ; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0
10513 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10514 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u]
10515 ; AVX2-NEXT: vpshufb %xmm0, %xmm12, %xmm1
10516 ; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm2
10517 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u]
10518 ; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
10519 ; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1
10520 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm2
10521 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5,6],ymm2[7,8],ymm4[9,10],ymm2[11],ymm4[12,13,14],ymm2[15]
10522 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12]
10523 ; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
10524 ; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2
10525 ; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm12
10526 ; AVX2-NEXT: vpshufb %xmm0, %xmm14, %xmm0
10527 ; AVX2-NEXT: vextracti128 $1, %ymm14, %xmm1
10528 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
10529 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
10530 ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm1
10531 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4,5,6],ymm1[7,8],ymm6[9,10],ymm1[11],ymm6[12,13,14],ymm1[15]
10532 ; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1
10533 ; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm14
10534 ; AVX2-NEXT: vextracti128 $1, %ymm13, %xmm0
10535 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u]
10536 ; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
10537 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u]
10538 ; AVX2-NEXT: vpshufb %xmm2, %xmm13, %xmm3
10539 ; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0
10540 ; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm3
10541 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2,3],ymm3[4],ymm9[5,6],ymm3[7,8],ymm9[9,10,11],ymm3[12],ymm9[13,14],ymm3[15]
10542 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13]
10543 ; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
10544 ; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3
10545 ; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm3, %ymm6
10546 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10547 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0
10548 ; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
10549 ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm1
10550 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
10551 ; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm1
10552 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm10[1,2,3],ymm1[4],ymm10[5,6],ymm1[7,8],ymm10[9,10,11],ymm1[12],ymm10[13,14],ymm1[15]
10553 ; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1
10554 ; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0
10555 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10556 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1
10557 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u]
10558 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
10559 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u]
10560 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
10561 ; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1
10562 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
10563 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm2
10564 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15]
10565 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14]
10566 ; AVX2-NEXT: # ymm5 = mem[0,1,0,1]
10567 ; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm2
10568 ; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm2
10569 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
10570 ; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm1
10571 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
10572 ; AVX2-NEXT: vpshufb %xmm4, %xmm7, %xmm3
10573 ; AVX2-NEXT: vpor %xmm1, %xmm3, %xmm1
10574 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10575 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm3
10576 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
10577 ; AVX2-NEXT: vpshufb %ymm5, %ymm3, %ymm3
10578 ; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1
10579 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u]
10580 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
10581 ; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm3
10582 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5
10583 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u]
10584 ; AVX2-NEXT: vpshufb %xmm7, %xmm5, %xmm5
10585 ; AVX2-NEXT: vpor %xmm3, %xmm5, %xmm3
10586 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
10587 ; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm5
10588 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7,8],ymm5[9],ymm8[10,11,12],ymm5[13],ymm8[14,15]
10589 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15]
10590 ; AVX2-NEXT: # ymm8 = mem[0,1,0,1]
10591 ; AVX2-NEXT: vpshufb %ymm8, %ymm5, %ymm5
10592 ; AVX2-NEXT: vpblendvb %ymm15, %ymm3, %ymm5, %ymm3
10593 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
10594 ; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm4
10595 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5
10596 ; AVX2-NEXT: vpshufb %xmm7, %xmm5, %xmm5
10597 ; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4
10598 ; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm5
10599 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0],ymm5[1],ymm11[2,3,4],ymm5[5],ymm11[6,7,8],ymm5[9],ymm11[10,11,12],ymm5[13],ymm11[14,15]
10600 ; AVX2-NEXT: vpshufb %ymm8, %ymm5, %ymm5
10601 ; AVX2-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4
10602 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
10603 ; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm5
10604 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,128,128,128,5,12,128,128,1,8,15,u,u,u,u]
10605 ; AVX2-NEXT: vpshufb %xmm7, %xmm5, %xmm5
10606 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,0,7,14,128,128,3,10,128,128,128,u,u,u,u]
10607 ; AVX2-NEXT: vpshufb %xmm8, %xmm9, %xmm9
10608 ; AVX2-NEXT: vpor %xmm5, %xmm9, %xmm5
10609 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
10610 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10611 ; AVX2-NEXT: vpshufb %xmm13, %xmm9, %xmm9
10612 ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13]
10613 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10614 ; AVX2-NEXT: vpshufb %xmm10, %xmm11, %xmm11
10615 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
10616 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
10617 ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
10618 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7]
10619 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
10620 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1,2,3,4,5,6,7],ymm9[8],ymm5[9,10,11,12,13,14,15]
10621 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
10622 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
10623 ; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm9
10624 ; AVX2-NEXT: vpshufb %xmm7, %xmm9, %xmm7
10625 ; AVX2-NEXT: vpshufb %xmm8, %xmm11, %xmm8
10626 ; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7
10627 ; AVX2-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload
10628 ; AVX2-NEXT: vpshufb %xmm13, %xmm8, %xmm8
10629 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10630 ; AVX2-NEXT: vpshufb %xmm10, %xmm9, %xmm9
10631 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
10632 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
10633 ; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
10634 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
10635 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
10636 ; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
10637 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
10638 ; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
10639 ; AVX2-NEXT: # ymm8 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15]
10640 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
10641 ; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload
10642 ; AVX2-NEXT: # ymm9 = ymm14[0],mem[1,2,3,4,5,6,7],ymm14[8],mem[9,10,11,12,13,14,15]
10643 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
10644 ; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload
10645 ; AVX2-NEXT: # ymm10 = ymm6[0],mem[1,2,3,4,5,6,7],ymm6[8],mem[9,10,11,12,13,14,15]
10646 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7]
10647 ; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
10648 ; AVX2-NEXT: # ymm10 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15]
10649 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
10650 ; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload
10651 ; AVX2-NEXT: # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15]
10652 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
10653 ; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
10654 ; AVX2-NEXT: # ymm10 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15]
10655 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
10656 ; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload
10657 ; AVX2-NEXT: # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15]
10658 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
10659 ; AVX2-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload
10660 ; AVX2-NEXT: # ymm10 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15]
10661 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
10662 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
10663 ; AVX2-NEXT: vmovaps %ymm10, 32(%rsi)
10664 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
10665 ; AVX2-NEXT: vmovaps %ymm10, (%rsi)
10666 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
10667 ; AVX2-NEXT: vmovaps %ymm10, 32(%rdx)
10668 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
10669 ; AVX2-NEXT: vmovaps %ymm10, (%rdx)
10670 ; AVX2-NEXT: vmovdqa %ymm5, 32(%rcx)
10671 ; AVX2-NEXT: vmovdqa %ymm7, (%rcx)
10672 ; AVX2-NEXT: vmovdqa %ymm8, 32(%r8)
10673 ; AVX2-NEXT: vmovdqa %ymm9, (%r8)
10674 ; AVX2-NEXT: vmovdqa %ymm6, 32(%r9)
10675 ; AVX2-NEXT: vmovdqa %ymm0, (%r9)
10676 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
10677 ; AVX2-NEXT: vmovdqa %ymm2, 32(%rax)
10678 ; AVX2-NEXT: vmovdqa %ymm1, (%rax)
10679 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
10680 ; AVX2-NEXT: vmovdqa %ymm3, 32(%rax)
10681 ; AVX2-NEXT: vmovdqa %ymm4, (%rax)
10682 ; AVX2-NEXT: addq $760, %rsp # imm = 0x2F8
10683 ; AVX2-NEXT: vzeroupper
10684 ; AVX2-NEXT: retq
10686 ; AVX2-FP-LABEL: load_i8_stride7_vf64:
10687 ; AVX2-FP: # %bb.0:
10688 ; AVX2-FP-NEXT: subq $760, %rsp # imm = 0x2F8
10689 ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm6
10690 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm7
10691 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm8
10692 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1
10693 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm2
10694 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm4
10695 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm5
10696 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
10697 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0
10698 ; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm13
10699 ; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm10
10700 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10701 ; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm12
10702 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10703 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2
10704 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u]
10705 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm3
10706 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u]
10707 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
10708 ; AVX2-FP-NEXT: vpor %xmm3, %xmm0, %xmm0
10709 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
10710 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm3
10711 ; AVX2-FP-NEXT: vmovdqa %ymm9, %ymm14
10712 ; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm9
10713 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10714 ; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm11
10715 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10716 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
10717 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
10718 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9]
10719 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1]
10720 ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm4, %ymm5
10721 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0]
10722 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0
10723 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10724 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm7, %ymm8, %ymm5
10725 ; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10726 ; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm0
10727 ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10728 ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm7
10729 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm7, %xmm1
10730 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm7
10731 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm5, %xmm2
10732 ; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm1
10733 ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm6, %ymm7, %ymm2
10734 ; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm15
10735 ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10736 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10737 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm5
10738 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15]
10739 ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
10740 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
10741 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10742 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm2
10743 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
10744 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15]
10745 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
10746 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm12, %ymm10, %ymm5
10747 ; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm1
10748 ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm3
10749 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u]
10750 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm7
10751 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u]
10752 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm5, %xmm5
10753 ; AVX2-FP-NEXT: vpor %xmm7, %xmm5, %xmm5
10754 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10]
10755 ; AVX2-FP-NEXT: # ymm7 = mem[0,1,0,1]
10756 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm4
10757 ; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm9 = [0,18446744073709551360,16777215,0]
10758 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4
10759 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10760 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm15, %ymm6, %ymm4
10761 ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5
10762 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7,8,9],ymm5[10],ymm4[11,12,13],ymm5[14],ymm4[15]
10763 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm4
10764 ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm5
10765 ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm7
10766 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm7, %xmm2
10767 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm5, %xmm3
10768 ; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2
10769 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm4, %ymm2, %ymm0
10770 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10771 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm9
10772 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm3
10773 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
10774 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm9, %ymm3, %ymm2
10775 ; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm15
10776 ; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10777 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
10778 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,3,10,128,128,128,6,13,u,u,u,u]
10779 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm3, %xmm3
10780 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,5,12,128,128,1,8,15,128,128,u,u,u,u]
10781 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm2
10782 ; AVX2-FP-NEXT: vpor %xmm3, %xmm2, %xmm2
10783 ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
10784 ; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm5
10785 ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm5, %xmm3
10786 ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
10787 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm1
10788 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm10
10789 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10790 ; AVX2-FP-NEXT: vmovdqa %xmm6, %xmm11
10791 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3]
10792 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
10793 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
10794 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
10795 ; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm14 = [18446744073709551615,18446744073709551615,16777215,0]
10796 ; AVX2-FP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
10797 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10798 ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm2
10799 ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm3
10800 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0
10801 ; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm4
10802 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm6
10803 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm6, %xmm6
10804 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm0
10805 ; AVX2-FP-NEXT: vpor %xmm6, %xmm0, %xmm0
10806 ; AVX2-FP-NEXT: vmovdqa 432(%rdi), %xmm13
10807 ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm13, %xmm6
10808 ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %xmm2
10809 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm2, %xmm8
10810 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
10811 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10812 ; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
10813 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7]
10814 ; AVX2-FP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10815 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10816 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
10817 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm9, %ymm15, %ymm0
10818 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,6,13,128,128,2,9,128,128,128,u,u,u,u]
10819 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm6
10820 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0
10821 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,4,11,128,128,0,7,14,u,u,u,u]
10822 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm0, %xmm0
10823 ; AVX2-FP-NEXT: vpor %xmm6, %xmm0, %xmm0
10824 ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
10825 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm9
10826 ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm12 = [5,12,5,12,5,12,5,12,5,12,5,12,5,12,5,12]
10827 ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm1, %xmm11
10828 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
10829 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10830 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
10831 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7]
10832 ; AVX2-FP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10833 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10834 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm0
10835 ; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm11
10836 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm1
10837 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0
10838 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm0, %xmm0
10839 ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0
10840 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm13, %xmm1
10841 ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm6
10842 ; AVX2-FP-NEXT: vmovdqa %xmm2, %xmm12
10843 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
10844 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
10845 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10846 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
10847 ; AVX2-FP-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10848 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10849 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
10850 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm0
10851 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10852 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
10853 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,128,128,128,6,13,128,128,2,9,u,u,u,u,u]
10854 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
10855 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,1,8,15,128,128,4,11,128,128,u,u,u,u,u]
10856 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm0
10857 ; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0
10858 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12]
10859 ; AVX2-FP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10860 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm13, %xmm8
10861 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm9 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128]
10862 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm12, %xmm10
10863 ; AVX2-FP-NEXT: vmovdqa %xmm12, %xmm3
10864 ; AVX2-FP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10865 ; AVX2-FP-NEXT: vpor %xmm8, %xmm10, %xmm8
10866 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10
10867 ; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
10868 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
10869 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,0,1]
10870 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm10, %ymm8, %ymm8
10871 ; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10872 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
10873 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm8
10874 ; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm10
10875 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm10, %xmm6
10876 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm8, %xmm7
10877 ; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6
10878 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm5, %xmm1
10879 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10880 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm14, %xmm7
10881 ; AVX2-FP-NEXT: vpor %xmm1, %xmm7, %xmm1
10882 ; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
10883 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10884 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1
10885 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10886 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
10887 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm11, %ymm4, %ymm1
10888 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,2,9,128,128,128,5,12,128,128,u,u,u,u,u]
10889 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm7
10890 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1
10891 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u]
10892 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
10893 ; AVX2-FP-NEXT: vpor %xmm7, %xmm1, %xmm1
10894 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13]
10895 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm13, %xmm9
10896 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128]
10897 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm3, %xmm11
10898 ; AVX2-FP-NEXT: vpor %xmm9, %xmm11, %xmm9
10899 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10900 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
10901 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1
10902 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10903 ; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10904 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm15, %ymm12, %ymm1
10905 ; AVX2-FP-NEXT: vmovdqa %ymm12, %ymm2
10906 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm6
10907 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1
10908 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
10909 ; AVX2-FP-NEXT: vpor %xmm6, %xmm1, %xmm1
10910 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm5, %xmm6
10911 ; AVX2-FP-NEXT: vmovdqa %xmm5, %xmm13
10912 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm14, %xmm7
10913 ; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6
10914 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10915 ; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
10916 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1
10917 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10918 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
10919 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10920 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm1
10921 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,3,10,128,128,128,6,13,128,128,u,u,u,u,u]
10922 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm7
10923 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1
10924 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u]
10925 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
10926 ; AVX2-FP-NEXT: vpor %xmm7, %xmm1, %xmm1
10927 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128]
10928 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
10929 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm12, %xmm9
10930 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14]
10931 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10932 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm14, %xmm11
10933 ; AVX2-FP-NEXT: vpor %xmm9, %xmm11, %xmm9
10934 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10935 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
10936 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1
10937 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10938 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm15, %ymm2, %ymm1
10939 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm6
10940 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1
10941 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
10942 ; AVX2-FP-NEXT: vpor %xmm6, %xmm1, %xmm1
10943 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10944 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm11, %xmm6
10945 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm13, %xmm7
10946 ; AVX2-FP-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill
10947 ; AVX2-FP-NEXT: vpor %xmm6, %xmm7, %xmm6
10948 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10949 ; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
10950 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1
10951 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10952 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
10953 ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm2
10954 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10955 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
10956 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm1
10957 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
10958 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,128,128,2,9,128,128,128,5,12,u,u,u,u,u]
10959 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
10960 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,4,11,128,128,0,7,14,128,128,u,u,u,u,u]
10961 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
10962 ; AVX2-FP-NEXT: vpor %xmm2, %xmm1, %xmm1
10963 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128]
10964 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm12, %xmm7
10965 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm8 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15]
10966 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm14, %xmm9
10967 ; AVX2-FP-NEXT: vpor %xmm7, %xmm9, %xmm7
10968 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10969 ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
10970 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm7, %ymm1
10971 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10972 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10973 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10974 ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm5
10975 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10976 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1
10977 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4
10978 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm3
10979 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
10980 ; AVX2-FP-NEXT: vpor %xmm3, %xmm1, %xmm1
10981 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm11, %xmm2
10982 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm13, %xmm3
10983 ; AVX2-FP-NEXT: vpor %xmm2, %xmm3, %xmm2
10984 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
10985 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
10986 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
10987 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10988 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10989 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10990 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
10991 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm1
10992 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
10993 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm12
10994 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm13
10995 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
10996 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm3
10997 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10998 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
10999 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
11000 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11001 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11002 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11003 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm3
11004 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11005 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm3
11006 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm14
11007 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm4
11008 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11009 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm0
11010 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11011 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11012 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11013 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
11014 ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm6, %ymm2, %ymm0
11015 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm4
11016 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm9
11017 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm2, %ymm6, %ymm8
11018 ; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11019 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm6, %ymm2
11020 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11021 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11022 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
11023 ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm2
11024 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm6
11025 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm7, %ymm8, %ymm10
11026 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm8, %ymm7, %ymm5
11027 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11028 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
11029 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm8, %ymm7, %ymm11
11030 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u]
11031 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm1, %xmm15
11032 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1
11033 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u]
11034 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm1
11035 ; AVX2-FP-NEXT: vpor %xmm1, %xmm15, %xmm1
11036 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm15
11037 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7,8,9,10],ymm15[11],ymm0[12,13],ymm15[14],ymm0[15]
11038 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11]
11039 ; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1]
11040 ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
11041 ; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm15 = [18446744073709551615,255]
11042 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm0
11043 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11044 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm3, %xmm0
11045 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm1
11046 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm1, %xmm1
11047 ; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm0
11048 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm1
11049 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
11050 ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm1, %ymm1
11051 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0
11052 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11053 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u]
11054 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm1
11055 ; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm2
11056 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u]
11057 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
11058 ; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm1
11059 ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm2
11060 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5,6],ymm2[7,8],ymm4[9,10],ymm2[11],ymm4[12,13,14],ymm2[15]
11061 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12]
11062 ; AVX2-FP-NEXT: # ymm4 = mem[0,1,0,1]
11063 ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
11064 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm12
11065 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm14, %xmm0
11066 ; AVX2-FP-NEXT: vextracti128 $1, %ymm14, %xmm1
11067 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
11068 ; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm0
11069 ; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm1
11070 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4,5,6],ymm1[7,8],ymm6[9,10],ymm1[11],ymm6[12,13,14],ymm1[15]
11071 ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
11072 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm14
11073 ; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm0
11074 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u]
11075 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
11076 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u]
11077 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm13, %xmm3
11078 ; AVX2-FP-NEXT: vpor %xmm0, %xmm3, %xmm0
11079 ; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm3
11080 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2,3],ymm3[4],ymm9[5,6],ymm3[7,8],ymm9[9,10,11],ymm3[12],ymm9[13,14],ymm3[15]
11081 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13]
11082 ; AVX2-FP-NEXT: # ymm4 = mem[0,1,0,1]
11083 ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
11084 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm3, %ymm6
11085 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11086 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm0
11087 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
11088 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm1
11089 ; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm0
11090 ; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm1
11091 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm10[1,2,3],ymm1[4],ymm10[5,6],ymm1[7,8],ymm10[9,10,11],ymm1[12],ymm10[13,14],ymm1[15]
11092 ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
11093 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0
11094 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11095 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm1
11096 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u]
11097 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
11098 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u]
11099 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
11100 ; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm1
11101 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11102 ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm2
11103 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15]
11104 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14]
11105 ; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1]
11106 ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
11107 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm2
11108 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11109 ; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm1
11110 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
11111 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm7, %xmm3
11112 ; AVX2-FP-NEXT: vpor %xmm1, %xmm3, %xmm1
11113 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11114 ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm3
11115 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
11116 ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm3, %ymm3
11117 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1
11118 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u]
11119 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11120 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm3
11121 ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5
11122 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u]
11123 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm5, %xmm5
11124 ; AVX2-FP-NEXT: vpor %xmm3, %xmm5, %xmm3
11125 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
11126 ; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm5
11127 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1],ymm8[2,3,4],ymm5[5],ymm8[6,7,8],ymm5[9],ymm8[10,11,12],ymm5[13],ymm8[14,15]
11128 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15]
11129 ; AVX2-FP-NEXT: # ymm8 = mem[0,1,0,1]
11130 ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm5, %ymm5
11131 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm3, %ymm5, %ymm3
11132 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11133 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm4
11134 ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5
11135 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm5, %xmm5
11136 ; AVX2-FP-NEXT: vpor %xmm4, %xmm5, %xmm4
11137 ; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm5
11138 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0],ymm5[1],ymm11[2,3,4],ymm5[5],ymm11[6,7,8],ymm5[9],ymm11[10,11,12],ymm5[13],ymm11[14,15]
11139 ; AVX2-FP-NEXT: vpshufb %ymm8, %ymm5, %ymm5
11140 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4
11141 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
11142 ; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm5
11143 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,128,128,128,5,12,128,128,1,8,15,u,u,u,u]
11144 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm5, %xmm5
11145 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,0,7,14,128,128,3,10,128,128,128,u,u,u,u]
11146 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm9, %xmm9
11147 ; AVX2-FP-NEXT: vpor %xmm5, %xmm9, %xmm5
11148 ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
11149 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
11150 ; AVX2-FP-NEXT: vpshufb %xmm13, %xmm9, %xmm9
11151 ; AVX2-FP-NEXT: vpbroadcastw {{.*#+}} xmm10 = [6,13,6,13,6,13,6,13,6,13,6,13,6,13,6,13]
11152 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
11153 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm11, %xmm11
11154 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
11155 ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
11156 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
11157 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7]
11158 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
11159 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1,2,3,4,5,6,7],ymm9[8],ymm5[9,10,11,12,13,14,15]
11160 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
11161 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
11162 ; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm9
11163 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm9, %xmm7
11164 ; AVX2-FP-NEXT: vpshufb %xmm8, %xmm11, %xmm8
11165 ; AVX2-FP-NEXT: vpor %xmm7, %xmm8, %xmm7
11166 ; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload
11167 ; AVX2-FP-NEXT: vpshufb %xmm13, %xmm8, %xmm8
11168 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
11169 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm9, %xmm9
11170 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
11171 ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
11172 ; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
11173 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
11174 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
11175 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
11176 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
11177 ; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
11178 ; AVX2-FP-NEXT: # ymm8 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15]
11179 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
11180 ; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload
11181 ; AVX2-FP-NEXT: # ymm9 = ymm14[0],mem[1,2,3,4,5,6,7],ymm14[8],mem[9,10,11,12,13,14,15]
11182 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
11183 ; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload
11184 ; AVX2-FP-NEXT: # ymm10 = ymm6[0],mem[1,2,3,4,5,6,7],ymm6[8],mem[9,10,11,12,13,14,15]
11185 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7]
11186 ; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
11187 ; AVX2-FP-NEXT: # ymm10 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15]
11188 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
11189 ; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload
11190 ; AVX2-FP-NEXT: # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15]
11191 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
11192 ; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
11193 ; AVX2-FP-NEXT: # ymm10 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15]
11194 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
11195 ; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload
11196 ; AVX2-FP-NEXT: # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15]
11197 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
11198 ; AVX2-FP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload
11199 ; AVX2-FP-NEXT: # ymm10 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15]
11200 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
11201 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11202 ; AVX2-FP-NEXT: vmovaps %ymm10, 32(%rsi)
11203 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11204 ; AVX2-FP-NEXT: vmovaps %ymm10, (%rsi)
11205 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11206 ; AVX2-FP-NEXT: vmovaps %ymm10, 32(%rdx)
11207 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11208 ; AVX2-FP-NEXT: vmovaps %ymm10, (%rdx)
11209 ; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%rcx)
11210 ; AVX2-FP-NEXT: vmovdqa %ymm7, (%rcx)
11211 ; AVX2-FP-NEXT: vmovdqa %ymm8, 32(%r8)
11212 ; AVX2-FP-NEXT: vmovdqa %ymm9, (%r8)
11213 ; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%r9)
11214 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%r9)
11215 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
11216 ; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%rax)
11217 ; AVX2-FP-NEXT: vmovdqa %ymm1, (%rax)
11218 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
11219 ; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%rax)
11220 ; AVX2-FP-NEXT: vmovdqa %ymm4, (%rax)
11221 ; AVX2-FP-NEXT: addq $760, %rsp # imm = 0x2F8
11222 ; AVX2-FP-NEXT: vzeroupper
11223 ; AVX2-FP-NEXT: retq
11225 ; AVX2-FCP-LABEL: load_i8_stride7_vf64:
11226 ; AVX2-FCP: # %bb.0:
11227 ; AVX2-FCP-NEXT: subq $776, %rsp # imm = 0x308
11228 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm15
11229 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm6
11230 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm10
11231 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1
11232 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm12
11233 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm4
11234 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm5
11235 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
11236 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm12, %ymm0
11237 ; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm7
11238 ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11239 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm13
11240 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11241 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
11242 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u]
11243 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm3
11244 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u]
11245 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
11246 ; AVX2-FCP-NEXT: vpor %xmm3, %xmm0, %xmm0
11247 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
11248 ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm3
11249 ; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm9
11250 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11251 ; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm11
11252 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11253 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
11254 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
11255 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9,0,7,14,0,0,0,0,0,0,0,6,13,4,11,2,9]
11256 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
11257 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm5
11258 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0]
11259 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0
11260 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11261 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm6, %ymm10, %ymm5
11262 ; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11263 ; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm7
11264 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11265 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
11266 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm1
11267 ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm6
11268 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2
11269 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1
11270 ; AVX2-FCP-NEXT: vmovdqa %ymm15, %ymm0
11271 ; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11272 ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm15, %ymm6, %ymm2
11273 ; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm8
11274 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11275 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5
11276 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15]
11277 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
11278 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
11279 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11280 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
11281 ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm11, %ymm9, %ymm2
11282 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
11283 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15]
11284 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
11285 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm13, %ymm12, %ymm4
11286 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm9
11287 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
11288 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u]
11289 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5
11290 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u]
11291 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4
11292 ; AVX2-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
11293 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10]
11294 ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1]
11295 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3
11296 ; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm1 = [0,18446744073709551360,16777215,0]
11297 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3
11298 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11299 ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm8, %ymm0, %ymm3
11300 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8
11301 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4,5],ymm8[6],ymm3[7,8,9],ymm8[10],ymm3[11,12,13],ymm8[14],ymm3[15]
11302 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3
11303 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm10, %ymm5
11304 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm8
11305 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2
11306 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5
11307 ; AVX2-FCP-NEXT: vpor %xmm2, %xmm5, %xmm2
11308 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm3, %ymm2, %ymm5
11309 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
11310 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm3
11311 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
11312 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm2
11313 ; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm12
11314 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11315 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm10
11316 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11317 ; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm9
11318 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
11319 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,128,128,3,10,128,128,128,6,13,u,u,u,u]
11320 ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3
11321 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,5,12,128,128,1,8,15,128,128,u,u,u,u]
11322 ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2
11323 ; AVX2-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
11324 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
11325 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm0
11326 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,2,0,2,1,2,4,6]
11327 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm4, %ymm3
11328 ; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm8
11329 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11330 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13,0,7,10,13]
11331 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3
11332 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
11333 ; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm6 = [18446744073709551615,18446744073709551615,16777215,0]
11334 ; AVX2-FCP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
11335 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11336 ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm2
11337 ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm3
11338 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm2, %ymm3, %ymm0
11339 ; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm14
11340 ; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm3
11341 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm13
11342 ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm7
11343 ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0
11344 ; AVX2-FCP-NEXT: vpor %xmm7, %xmm0, %xmm0
11345 ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm7
11346 ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm4, %ymm4
11347 ; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm9
11348 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11349 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1
11350 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
11351 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
11352 ; AVX2-FCP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11353 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11354 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
11355 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm0
11356 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,6,13,128,128,2,9,128,128,128,u,u,u,u]
11357 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm4
11358 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
11359 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,128,128,4,11,128,128,0,7,14,u,u,u,u]
11360 ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0
11361 ; AVX2-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0
11362 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
11363 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,0,2,1,3,4,6]
11364 ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm11
11365 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14,1,4,11,14]
11366 ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm11
11367 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7]
11368 ; AVX2-FCP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11369 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11370 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm2, %ymm14, %ymm0
11371 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm1
11372 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
11373 ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0
11374 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
11375 ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm4, %ymm1
11376 ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1
11377 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
11378 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
11379 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0
11380 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11381 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
11382 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm14, %ymm2, %ymm0
11383 ; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11384 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
11385 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,128,128,128,6,13,128,128,2,9,u,u,u,u,u]
11386 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1
11387 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,1,8,15,128,128,4,11,128,128,u,u,u,u,u]
11388 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
11389 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
11390 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12]
11391 ; AVX2-FCP-NEXT: vmovdqa 432(%rdi), %xmm5
11392 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm1
11393 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,0,7,14,128,128,0,0,0,0,7,14,128,128]
11394 ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %xmm15
11395 ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm12
11396 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm12, %xmm1
11397 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12
11398 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
11399 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
11400 ; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1]
11401 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm12, %ymm1, %ymm1
11402 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11403 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11404 ; AVX2-FCP-NEXT: vpblendvb %ymm7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
11405 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm12
11406 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm12, %xmm4
11407 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
11408 ; AVX2-FCP-NEXT: vpor %xmm4, %xmm1, %xmm6
11409 ; AVX2-FCP-NEXT: vmovdqa 208(%rdi), %xmm2
11410 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm10
11411 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11412 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm8
11413 ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm11
11414 ; AVX2-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10
11415 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
11416 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
11417 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm10, %ymm1
11418 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11419 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
11420 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm3, %ymm6
11421 ; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm14
11422 ; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm3
11423 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,2,9,128,128,128,5,12,128,128,u,u,u,u,u]
11424 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm11
11425 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
11426 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u]
11427 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm6
11428 ; AVX2-FCP-NEXT: vpor %xmm6, %xmm11, %xmm6
11429 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13]
11430 ; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill
11431 ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm13
11432 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,1,8,15,128,128,0,0,0,1,8,15,128,128]
11433 ; AVX2-FCP-NEXT: vmovdqa %xmm15, %xmm7
11434 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm15
11435 ; AVX2-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13
11436 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
11437 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
11438 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm13, %ymm0
11439 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11440 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11441 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11442 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm6
11443 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm10
11444 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
11445 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm6
11446 ; AVX2-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6
11447 ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm10
11448 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm1
11449 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm10, %xmm1
11450 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
11451 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
11452 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm1, %ymm1
11453 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11454 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
11455 ; AVX2-FCP-NEXT: vmovdqa %ymm14, %ymm3
11456 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11457 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm14, %ymm1
11458 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,3,10,128,128,128,6,13,128,128,u,u,u,u,u]
11459 ; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm10
11460 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
11461 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u]
11462 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1
11463 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm10, %xmm1
11464 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128]
11465 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm13
11466 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,0,128,128,0,7,14,0,0,0,128,128,0,7,14]
11467 ; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm11
11468 ; AVX2-FCP-NEXT: vpor %xmm13, %xmm11, %xmm11
11469 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
11470 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
11471 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm11, %ymm1
11472 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11473 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm1
11474 ; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm13
11475 ; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm6
11476 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
11477 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1
11478 ; AVX2-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1
11479 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm6
11480 ; AVX2-FCP-NEXT: vmovdqa %xmm8, %xmm14
11481 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11482 ; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm10
11483 ; AVX2-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6
11484 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
11485 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
11486 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm6, %ymm0
11487 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11488 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
11489 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm1
11490 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11491 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0]
11492 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm1
11493 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
11494 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,128,128,2,9,128,128,128,5,12,u,u,u,u,u]
11495 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
11496 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,4,11,128,128,0,7,14,128,128,u,u,u,u,u]
11497 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1
11498 ; AVX2-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
11499 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128]
11500 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm5
11501 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,128,128,1,8,15,0,0,0,128,128,1,8,15]
11502 ; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
11503 ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm8
11504 ; AVX2-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5
11505 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
11506 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
11507 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm5, %ymm0
11508 ; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
11509 ; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm0
11510 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm4, %ymm13, %ymm4
11511 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11512 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm13, %ymm0, %ymm1
11513 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5
11514 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3
11515 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1
11516 ; AVX2-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1
11517 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm2
11518 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11519 ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm3
11520 ; AVX2-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
11521 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
11522 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
11523 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm0
11524 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11525 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11526 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11527 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
11528 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm11
11529 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535]
11530 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm9
11531 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm10
11532 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
11533 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm2
11534 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11535 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,65535,0,0,0,65535,0,0,65535,0,0,0,65535]
11536 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
11537 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11538 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11539 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11540 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2
11541 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11542 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm3
11543 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm8
11544 ; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm15
11545 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm2
11546 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11547 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0
11548 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11549 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11550 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11551 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,0,0,65535,0,0,0,65535,0,0,65535,0]
11552 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm1
11553 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm4
11554 ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm14
11555 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm13
11556 ; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11557 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0
11558 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11559 ; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm13
11560 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11561 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11562 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm5, %ymm2
11563 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm5, %ymm12
11564 ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm15
11565 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm6
11566 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11567 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm5, %ymm0, %ymm0
11568 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11569 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u]
11570 ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm0
11571 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
11572 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u]
11573 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm11
11574 ; AVX2-FCP-NEXT: vpor %xmm0, %xmm11, %xmm0
11575 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm11
11576 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7,8,9,10],ymm11[11],ymm1[12,13],ymm11[14],ymm1[15]
11577 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11,2,9,0,0,0,0,0,0,0,1,8,15,6,13,4,11]
11578 ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1]
11579 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1
11580 ; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm11 = [18446744073709551615,255]
11581 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0
11582 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11583 ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm0
11584 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm1
11585 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
11586 ; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
11587 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm1
11588 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
11589 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1
11590 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0
11591 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11592 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u]
11593 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm1
11594 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm2
11595 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u]
11596 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
11597 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1
11598 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm2
11599 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5,6],ymm2[7,8],ymm4[9,10],ymm2[11],ymm4[12,13,14],ymm2[15]
11600 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12,3,10,0,0,0,0,0,0,0,2,9,0,7,14,5,12]
11601 ; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1]
11602 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
11603 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm9
11604 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm0
11605 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm1
11606 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
11607 ; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
11608 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm1
11609 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5,6],ymm1[7,8],ymm12[9,10],ymm1[11],ymm12[12,13,14],ymm1[15]
11610 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
11611 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm12
11612 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0
11613 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u]
11614 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
11615 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u]
11616 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm3
11617 ; AVX2-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0
11618 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm14, %xmm3
11619 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1,2,3],ymm3[4],ymm14[5,6],ymm3[7,8],ymm14[9,10,11],ymm3[12],ymm14[13,14],ymm3[15]
11620 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13,4,11,0,0,0,0,0,0,0,3,10,1,8,15,6,13]
11621 ; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1]
11622 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
11623 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm14
11624 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11625 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm0
11626 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
11627 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm1
11628 ; AVX2-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
11629 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm1
11630 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1,2,3],ymm1[4],ymm15[5,6],ymm1[7,8],ymm15[9,10,11],ymm1[12],ymm15[13,14],ymm1[15]
11631 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
11632 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm15
11633 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11634 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm0
11635 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u]
11636 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
11637 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u]
11638 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3
11639 ; AVX2-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0
11640 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11641 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm3
11642 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
11643 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14,5,12,0,0,0,0,0,0,0,4,11,2,9,0,7,14]
11644 ; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1]
11645 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
11646 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0
11647 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11648 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm3
11649 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1
11650 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2
11651 ; AVX2-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1
11652 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11653 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm2
11654 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
11655 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
11656 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1
11657 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u]
11658 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11659 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm2
11660 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
11661 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u]
11662 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
11663 ; AVX2-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2
11664 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11665 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm4
11666 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7,8],ymm4[9],ymm6[10,11,12],ymm4[13],ymm6[14,15]
11667 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15,6,13,0,0,0,0,0,0,0,5,12,3,10,1,8,15]
11668 ; AVX2-FCP-NEXT: # ymm6 = mem[0,1,0,1]
11669 ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4
11670 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm2, %ymm4, %ymm2
11671 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11672 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm3
11673 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
11674 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
11675 ; AVX2-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
11676 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11677 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm4
11678 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15]
11679 ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4
11680 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3
11681 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11682 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm4
11683 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,128,128,128,5,12,128,128,1,8,15,u,u,u,u]
11684 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
11685 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,0,7,14,128,128,3,10,128,128,128,u,u,u,u]
11686 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7
11687 ; AVX2-FCP-NEXT: vpor %xmm4, %xmm7, %xmm4
11688 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
11689 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,1,2,1,3,5,6]
11690 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
11691 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15,2,5,8,15]
11692 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8
11693 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
11694 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
11695 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm4[1,2,3,4,5,6,7],ymm8[8],ymm4[9,10,11,12,13,14,15]
11696 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
11697 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
11698 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm8
11699 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5
11700 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm6
11701 ; AVX2-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5
11702 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
11703 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6
11704 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
11705 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
11706 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11707 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15]
11708 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
11709 ; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload
11710 ; AVX2-FCP-NEXT: # ymm6 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15]
11711 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
11712 ; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload
11713 ; AVX2-FCP-NEXT: # ymm7 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15]
11714 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7]
11715 ; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload
11716 ; AVX2-FCP-NEXT: # ymm8 = ymm14[0],mem[1,2,3,4,5,6,7],ymm14[8],mem[9,10,11,12,13,14,15]
11717 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
11718 ; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload
11719 ; AVX2-FCP-NEXT: # ymm9 = ymm15[0],mem[1,2,3,4,5,6,7],ymm15[8],mem[9,10,11,12,13,14,15]
11720 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7]
11721 ; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
11722 ; AVX2-FCP-NEXT: # ymm10 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15]
11723 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
11724 ; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
11725 ; AVX2-FCP-NEXT: # ymm10 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15]
11726 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
11727 ; AVX2-FCP-NEXT: vpblendw $254, (%rsp), %ymm2, %ymm10 # 32-byte Folded Reload
11728 ; AVX2-FCP-NEXT: # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15]
11729 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
11730 ; AVX2-FCP-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload
11731 ; AVX2-FCP-NEXT: # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15]
11732 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7]
11733 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11734 ; AVX2-FCP-NEXT: vmovaps %ymm10, 32(%rsi)
11735 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11736 ; AVX2-FCP-NEXT: vmovaps %ymm10, (%rsi)
11737 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11738 ; AVX2-FCP-NEXT: vmovaps %ymm10, 32(%rdx)
11739 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11740 ; AVX2-FCP-NEXT: vmovaps %ymm10, (%rdx)
11741 ; AVX2-FCP-NEXT: vmovdqa %ymm4, 32(%rcx)
11742 ; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rcx)
11743 ; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%r8)
11744 ; AVX2-FCP-NEXT: vmovdqa %ymm7, (%r8)
11745 ; AVX2-FCP-NEXT: vmovdqa %ymm8, 32(%r9)
11746 ; AVX2-FCP-NEXT: vmovdqa %ymm9, (%r9)
11747 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
11748 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rax)
11749 ; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rax)
11750 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
11751 ; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rax)
11752 ; AVX2-FCP-NEXT: vmovdqa %ymm3, (%rax)
11753 ; AVX2-FCP-NEXT: addq $776, %rsp # imm = 0x308
11754 ; AVX2-FCP-NEXT: vzeroupper
11755 ; AVX2-FCP-NEXT: retq
11757 ; AVX512-LABEL: load_i8_stride7_vf64:
11758 ; AVX512: # %bb.0:
11759 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
11760 ; AVX512-NEXT: vmovdqa (%rdi), %ymm12
11761 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm13
11762 ; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm31
11763 ; AVX512-NEXT: vmovdqa %ymm0, %ymm1
11764 ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm24
11765 ; AVX512-NEXT: vpternlogq $202, %ymm13, %ymm12, %ymm1
11766 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
11767 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u]
11768 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
11769 ; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1
11770 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
11771 ; AVX512-NEXT: vmovdqa64 96(%rdi), %ymm19
11772 ; AVX512-NEXT: vmovdqa %ymm9, %ymm2
11773 ; AVX512-NEXT: vpternlogq $202, %ymm31, %ymm19, %ymm2
11774 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm11
11775 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3,4],ymm11[5],ymm2[6,7,8,9],ymm11[10],ymm2[11,12],ymm11[13],ymm2[14,15]
11776 ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
11777 ; AVX512-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
11778 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
11779 ; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm21
11780 ; AVX512-NEXT: vmovdqa64 160(%rdi), %ymm29
11781 ; AVX512-NEXT: vmovdqa %ymm14, %ymm1
11782 ; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm29, %ymm1
11783 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3
11784 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u]
11785 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
11786 ; AVX512-NEXT: vpor %xmm3, %xmm1, %xmm1
11787 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
11788 ; AVX512-NEXT: vmovdqa 192(%rdi), %xmm0
11789 ; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
11790 ; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm3
11791 ; AVX512-NEXT: vmovdqa64 %xmm4, %xmm27
11792 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm20
11793 ; AVX512-NEXT: vmovdqa 208(%rdi), %xmm10
11794 ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
11795 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
11796 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
11797 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
11798 ; AVX512-NEXT: vmovdqa 240(%rdi), %xmm0
11799 ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u]
11800 ; AVX512-NEXT: vmovdqa %xmm0, %xmm3
11801 ; AVX512-NEXT: vmovdqa 224(%rdi), %xmm0
11802 ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
11803 ; AVX512-NEXT: vmovdqa %xmm0, %xmm6
11804 ; AVX512-NEXT: vpor %xmm5, %xmm7, %xmm5
11805 ; AVX512-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm22
11806 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
11807 ; AVX512-NEXT: vpternlogq $184, %zmm2, %zmm4, %zmm22
11808 ; AVX512-NEXT: vmovdqa64 288(%rdi), %ymm18
11809 ; AVX512-NEXT: vmovdqa64 256(%rdi), %ymm16
11810 ; AVX512-NEXT: vmovdqa %ymm9, %ymm2
11811 ; AVX512-NEXT: vpternlogq $202, %ymm18, %ymm16, %ymm2
11812 ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u]
11813 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
11814 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u]
11815 ; AVX512-NEXT: vpor %xmm5, %xmm2, %xmm2
11816 ; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm17
11817 ; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm28
11818 ; AVX512-NEXT: vmovdqa %ymm14, %ymm7
11819 ; AVX512-NEXT: vpternlogq $202, %ymm17, %ymm28, %ymm7
11820 ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
11821 ; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7,8,9],ymm8[10],ymm7[11,12,13],ymm8[14],ymm7[15]
11822 ; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
11823 ; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
11824 ; AVX512-NEXT: vpternlogq $248, %ymm23, %ymm2, %ymm8
11825 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
11826 ; AVX512-NEXT: vmovdqa %ymm7, %ymm2
11827 ; AVX512-NEXT: vpternlogq $202, %ymm13, %ymm12, %ymm2
11828 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm15
11829 ; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
11830 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u]
11831 ; AVX512-NEXT: vpor %xmm2, %xmm15, %xmm2
11832 ; AVX512-NEXT: vmovdqa64 %ymm24, %ymm15
11833 ; AVX512-NEXT: vmovdqa64 %ymm24, %ymm5
11834 ; AVX512-NEXT: vpternlogq $202, %ymm19, %ymm31, %ymm15
11835 ; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm11[2],ymm15[3,4,5],ymm11[6],ymm15[7,8,9],ymm11[10],ymm15[11,12,13],ymm11[14],ymm15[15]
11836 ; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
11837 ; AVX512-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm15
11838 ; AVX512-NEXT: vmovdqa %ymm9, %ymm2
11839 ; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm29, %ymm2
11840 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u]
11841 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
11842 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u]
11843 ; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
11844 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u,u,u,u,u]
11845 ; AVX512-NEXT: vmovdqa64 %xmm3, %xmm25
11846 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
11847 ; AVX512-NEXT: vmovdqa64 %xmm6, %xmm26
11848 ; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2
11849 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
11850 ; AVX512-NEXT: vmovdqa64 %xmm20, %xmm1
11851 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
11852 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
11853 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
11854 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
11855 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
11856 ; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm24
11857 ; AVX512-NEXT: vpternlogq $184, %zmm15, %zmm4, %zmm24
11858 ; AVX512-NEXT: vmovdqa %ymm14, %ymm0
11859 ; AVX512-NEXT: vpternlogq $202, %ymm13, %ymm12, %ymm0
11860 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u]
11861 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
11862 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u]
11863 ; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0
11864 ; AVX512-NEXT: vmovdqa %ymm7, %ymm2
11865 ; AVX512-NEXT: vpternlogq $202, %ymm19, %ymm31, %ymm2
11866 ; AVX512-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11867 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm11[3],ymm2[4,5],ymm11[6],ymm2[7,8,9,10],ymm11[11],ymm2[12,13],ymm11[14],ymm2[15]
11868 ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
11869 ; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm3 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
11870 ; AVX512-NEXT: vpternlogq $248, %ymm3, %ymm0, %ymm2
11871 ; AVX512-NEXT: vmovdqa %ymm3, %ymm15
11872 ; AVX512-NEXT: vmovdqa %ymm5, %ymm0
11873 ; AVX512-NEXT: vpternlogq $202, %ymm29, %ymm21, %ymm0
11874 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
11875 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u]
11876 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u]
11877 ; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0
11878 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
11879 ; AVX512-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11880 ; AVX512-NEXT: vmovdqa64 %xmm27, %xmm3
11881 ; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm3
11882 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
11883 ; AVX512-NEXT: vmovdqa64 %xmm20, %xmm5
11884 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
11885 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
11886 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
11887 ; AVX512-NEXT: vmovdqa64 %xmm26, %xmm1
11888 ; AVX512-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11889 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
11890 ; AVX512-NEXT: vmovdqa64 %xmm25, %xmm4
11891 ; AVX512-NEXT: vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11892 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
11893 ; AVX512-NEXT: vpor %xmm3, %xmm6, %xmm3
11894 ; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm25
11895 ; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm20 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
11896 ; AVX512-NEXT: vpternlogq $226, %zmm2, %zmm20, %zmm25
11897 ; AVX512-NEXT: vmovdqa %ymm9, %ymm0
11898 ; AVX512-NEXT: vpternlogq $202, %ymm13, %ymm12, %ymm0
11899 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u]
11900 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
11901 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u]
11902 ; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0
11903 ; AVX512-NEXT: vmovdqa %ymm14, %ymm2
11904 ; AVX512-NEXT: vpternlogq $202, %ymm19, %ymm31, %ymm2
11905 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm2[1,2],ymm11[3],ymm2[4,5,6],ymm11[7,8],ymm2[9,10],ymm11[11],ymm2[12,13,14],ymm11[15]
11906 ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
11907 ; AVX512-NEXT: vpternlogq $248, %ymm15, %ymm0, %ymm3
11908 ; AVX512-NEXT: vmovdqa %ymm15, %ymm11
11909 ; AVX512-NEXT: vmovdqa %ymm7, %ymm0
11910 ; AVX512-NEXT: vpternlogq $202, %ymm29, %ymm21, %ymm0
11911 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
11912 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[2,9,u,u,u,u,u]
11913 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u]
11914 ; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0
11915 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
11916 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
11917 ; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm6
11918 ; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
11919 ; AVX512-NEXT: vmovdqa %xmm5, %xmm10
11920 ; AVX512-NEXT: vpor %xmm6, %xmm15, %xmm6
11921 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
11922 ; AVX512-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm6
11923 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
11924 ; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
11925 ; AVX512-NEXT: vpor %xmm0, %xmm15, %xmm0
11926 ; AVX512-NEXT: vmovdqa64 416(%rdi), %ymm26
11927 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm30
11928 ; AVX512-NEXT: vmovdqa64 384(%rdi), %ymm27
11929 ; AVX512-NEXT: vpternlogq $226, %zmm3, %zmm20, %zmm30
11930 ; AVX512-NEXT: vmovdqa %ymm7, %ymm0
11931 ; AVX512-NEXT: vpternlogq $202, %ymm26, %ymm27, %ymm0
11932 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
11933 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9]
11934 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero
11935 ; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0
11936 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
11937 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
11938 ; AVX512-NEXT: vpternlogq $184, %ymm8, %ymm1, %ymm0
11939 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20
11940 ; AVX512-NEXT: vpmovsxwd {{.*#+}} zmm8 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
11941 ; AVX512-NEXT: vpternlogq $184, %zmm22, %zmm8, %zmm20
11942 ; AVX512-NEXT: vmovdqa %ymm7, %ymm0
11943 ; AVX512-NEXT: vpternlogq $202, %ymm16, %ymm18, %ymm0
11944 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
11945 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u]
11946 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u]
11947 ; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0
11948 ; AVX512-NEXT: vmovdqa %ymm9, %ymm3
11949 ; AVX512-NEXT: vpternlogq $202, %ymm17, %ymm28, %ymm3
11950 ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
11951 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7,8,9,10],ymm6[11],ymm3[12,13],ymm6[14],ymm3[15]
11952 ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
11953 ; AVX512-NEXT: vpternlogq $248, %ymm23, %ymm0, %ymm3
11954 ; AVX512-NEXT: vmovdqa %ymm14, %ymm0
11955 ; AVX512-NEXT: vpternlogq $202, %ymm26, %ymm27, %ymm0
11956 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero
11957 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
11958 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10]
11959 ; AVX512-NEXT: vpor %xmm6, %xmm0, %xmm0
11960 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
11961 ; AVX512-NEXT: vpternlogq $184, %ymm3, %ymm1, %ymm0
11962 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm22
11963 ; AVX512-NEXT: vpternlogq $184, %zmm24, %zmm8, %zmm22
11964 ; AVX512-NEXT: vmovdqa %ymm14, %ymm0
11965 ; AVX512-NEXT: vpternlogq $202, %ymm16, %ymm18, %ymm0
11966 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
11967 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u]
11968 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u]
11969 ; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0
11970 ; AVX512-NEXT: vmovdqa %ymm7, %ymm3
11971 ; AVX512-NEXT: vpternlogq $202, %ymm28, %ymm17, %ymm3
11972 ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
11973 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1,2],ymm6[3],ymm3[4,5,6],ymm6[7,8],ymm3[9,10],ymm6[11],ymm3[12,13,14],ymm6[15]
11974 ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
11975 ; AVX512-NEXT: vpternlogq $248, %ymm23, %ymm0, %ymm3
11976 ; AVX512-NEXT: vmovdqa %ymm9, %ymm0
11977 ; AVX512-NEXT: vpternlogq $202, %ymm26, %ymm27, %ymm0
11978 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero
11979 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
11980 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11]
11981 ; AVX512-NEXT: vpor %xmm6, %xmm0, %xmm0
11982 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
11983 ; AVX512-NEXT: vpternlogq $184, %ymm3, %ymm1, %ymm0
11984 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm24
11985 ; AVX512-NEXT: vpternlogq $184, %zmm25, %zmm8, %zmm24
11986 ; AVX512-NEXT: vmovdqa %ymm14, %ymm0
11987 ; AVX512-NEXT: vpternlogq $202, %ymm28, %ymm17, %ymm0
11988 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
11989 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6],ymm3[7,8],ymm0[9,10,11],ymm3[12],ymm0[13,14],ymm3[15]
11990 ; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm0
11991 ; AVX512-NEXT: vmovdqa %ymm9, %ymm2
11992 ; AVX512-NEXT: vpternlogq $202, %ymm16, %ymm18, %ymm2
11993 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u]
11994 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
11995 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u]
11996 ; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
11997 ; AVX512-NEXT: vpternlogq $236, %ymm23, %ymm0, %ymm2
11998 ; AVX512-NEXT: vmovdqa %ymm7, %ymm0
11999 ; AVX512-NEXT: vpternlogq $202, %ymm27, %ymm26, %ymm0
12000 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
12001 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12]
12002 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero
12003 ; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0
12004 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12005 ; AVX512-NEXT: vpternlogq $184, %ymm2, %ymm1, %ymm0
12006 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25
12007 ; AVX512-NEXT: vpternlogq $184, %zmm30, %zmm8, %zmm25
12008 ; AVX512-NEXT: vmovdqa %ymm9, %ymm0
12009 ; AVX512-NEXT: vpternlogq $202, %ymm28, %ymm17, %ymm0
12010 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
12011 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15]
12012 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
12013 ; AVX512-NEXT: vmovdqa %ymm4, %ymm2
12014 ; AVX512-NEXT: vpternlogq $202, %ymm18, %ymm16, %ymm2
12015 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
12016 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u]
12017 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u]
12018 ; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
12019 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
12020 ; AVX512-NEXT: vpternlogq $236, %ymm23, %ymm0, %ymm2
12021 ; AVX512-NEXT: vmovdqa %ymm14, %ymm0
12022 ; AVX512-NEXT: vpternlogq $202, %ymm27, %ymm26, %ymm0
12023 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
12024 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13]
12025 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero
12026 ; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0
12027 ; AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm30
12028 ; AVX512-NEXT: vpternlogq $184, %ymm2, %ymm1, %ymm30
12029 ; AVX512-NEXT: vmovdqa %ymm4, %ymm0
12030 ; AVX512-NEXT: vpternlogq $202, %ymm17, %ymm28, %ymm0
12031 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
12032 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
12033 ; AVX512-NEXT: vmovdqa %ymm7, %ymm2
12034 ; AVX512-NEXT: vpternlogq $202, %ymm18, %ymm16, %ymm2
12035 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
12036 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u]
12037 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u]
12038 ; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
12039 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
12040 ; AVX512-NEXT: vpshufb %ymm5, %ymm0, %ymm0
12041 ; AVX512-NEXT: vpternlogq $220, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
12042 ; AVX512-NEXT: vmovdqa %ymm9, %ymm0
12043 ; AVX512-NEXT: vpternlogq $202, %ymm27, %ymm26, %ymm0
12044 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero
12045 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
12046 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14]
12047 ; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0
12048 ; AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm23
12049 ; AVX512-NEXT: vpternlogq $184, %ymm2, %ymm1, %ymm23
12050 ; AVX512-NEXT: vmovdqa %ymm7, %ymm0
12051 ; AVX512-NEXT: vpternlogq $202, %ymm12, %ymm13, %ymm0
12052 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
12053 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
12054 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u]
12055 ; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0
12056 ; AVX512-NEXT: vpternlogq $202, %ymm26, %ymm27, %ymm4
12057 ; AVX512-NEXT: vmovdqa %ymm14, %ymm2
12058 ; AVX512-NEXT: vpternlogq $202, %ymm12, %ymm13, %ymm2
12059 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
12060 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u]
12061 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u,u,u,u,u,u]
12062 ; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
12063 ; AVX512-NEXT: vmovdqa %ymm9, %ymm3
12064 ; AVX512-NEXT: vmovdqa %ymm9, %ymm15
12065 ; AVX512-NEXT: vpternlogq $202, %ymm12, %ymm13, %ymm9
12066 ; AVX512-NEXT: vpternlogq $202, %ymm19, %ymm31, %ymm3
12067 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u]
12068 ; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm9
12069 ; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[4,11],zero,zero,xmm9[0,7,14,u,u,u,u,u,u,u]
12070 ; AVX512-NEXT: vpor %xmm6, %xmm9, %xmm6
12071 ; AVX512-NEXT: vmovdqa %ymm14, %ymm12
12072 ; AVX512-NEXT: vpternlogq $226, %ymm18, %ymm14, %ymm16
12073 ; AVX512-NEXT: vmovdqa %ymm7, %ymm9
12074 ; AVX512-NEXT: vpternlogq $202, %ymm31, %ymm19, %ymm9
12075 ; AVX512-NEXT: vpternlogq $202, %ymm31, %ymm19, %ymm14
12076 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
12077 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1,2,3],ymm8[4],ymm3[5,6],ymm8[7,8],ymm3[9,10,11],ymm8[12],ymm3[13,14],ymm8[15]
12078 ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12079 ; AVX512-NEXT: vmovdqa %ymm11, %ymm1
12080 ; AVX512-NEXT: vpternlogq $248, %ymm11, %ymm0, %ymm3
12081 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15]
12082 ; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12083 ; AVX512-NEXT: vpternlogq $248, %ymm1, %ymm2, %ymm11
12084 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm8[1],ymm14[2,3,4],ymm8[5],ymm14[6,7,8],ymm8[9],ymm14[10,11,12],ymm8[13],ymm14[14,15]
12085 ; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12086 ; AVX512-NEXT: vpternlogq $248, %ymm1, %ymm6, %ymm9
12087 ; AVX512-NEXT: vpternlogq $202, %ymm29, %ymm21, %ymm12
12088 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u]
12089 ; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm2
12090 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u]
12091 ; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
12092 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
12093 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
12094 ; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm2
12095 ; AVX512-NEXT: vmovdqa %xmm10, %xmm13
12096 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
12097 ; AVX512-NEXT: vpor %xmm2, %xmm6, %xmm2
12098 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12099 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
12100 ; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
12101 ; AVX512-NEXT: vpternlogq $184, %ymm0, %ymm1, %ymm2
12102 ; AVX512-NEXT: vpternlogq $226, %ymm17, %ymm7, %ymm28
12103 ; AVX512-NEXT: vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
12104 ; AVX512-NEXT: vpternlogq $202, %ymm29, %ymm21, %ymm15
12105 ; AVX512-NEXT: vpternlogq $202, %ymm21, %ymm29, %ymm7
12106 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
12107 ; AVX512-NEXT: vpshufb %xmm10, %xmm8, %xmm0
12108 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
12109 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm14[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12110 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
12111 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2
12112 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
12113 ; AVX512-NEXT: vpternlogq $184, %zmm3, %zmm0, %zmm2
12114 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u]
12115 ; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm6
12116 ; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u]
12117 ; AVX512-NEXT: vpor %xmm3, %xmm6, %xmm3
12118 ; AVX512-NEXT: vpshufb %xmm5, %xmm12, %xmm6
12119 ; AVX512-NEXT: vmovdqa %xmm12, %xmm15
12120 ; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
12121 ; AVX512-NEXT: vpor %xmm6, %xmm12, %xmm6
12122 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
12123 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
12124 ; AVX512-NEXT: vpternlogq $184, %ymm3, %ymm1, %ymm6
12125 ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm3
12126 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u]
12127 ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u]
12128 ; AVX512-NEXT: vpor %xmm3, %xmm7, %xmm3
12129 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
12130 ; AVX512-NEXT: vpshufb %xmm7, %xmm15, %xmm12
12131 ; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
12132 ; AVX512-NEXT: vpor %xmm13, %xmm12, %xmm12
12133 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
12134 ; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
12135 ; AVX512-NEXT: vpternlogq $184, %ymm3, %ymm1, %ymm12
12136 ; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12137 ; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12138 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
12139 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3
12140 ; AVX512-NEXT: vpternlogq $184, %zmm11, %zmm0, %zmm3
12141 ; AVX512-NEXT: vpshufb %xmm10, %xmm14, %xmm6
12142 ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12143 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
12144 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm6
12145 ; AVX512-NEXT: vpternlogq $184, %zmm9, %zmm0, %zmm6
12146 ; AVX512-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm0
12147 ; AVX512-NEXT: movw $-512, %ax # imm = 0xFE00
12148 ; AVX512-NEXT: kmovw %eax, %k1
12149 ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
12150 ; AVX512-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm0
12151 ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
12152 ; AVX512-NEXT: vmovdqa64 %ymm28, %ymm1
12153 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm28[2,3,0,1]
12154 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
12155 ; AVX512-NEXT: vpshufb %ymm7, %ymm0, %ymm0
12156 ; AVX512-NEXT: vmovdqa64 %ymm16, %ymm1
12157 ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u]
12158 ; AVX512-NEXT: vextracti32x4 $1, %ymm16, %xmm1
12159 ; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u]
12160 ; AVX512-NEXT: vpor %xmm5, %xmm1, %xmm1
12161 ; AVX512-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
12162 ; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm0
12163 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
12164 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero
12165 ; AVX512-NEXT: vpor %xmm0, %xmm4, %xmm0
12166 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12167 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
12168 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12169 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
12170 ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1}
12171 ; AVX512-NEXT: vmovdqa64 %zmm20, (%rsi)
12172 ; AVX512-NEXT: vmovdqa64 %zmm22, (%rdx)
12173 ; AVX512-NEXT: vmovdqa64 %zmm24, (%rcx)
12174 ; AVX512-NEXT: vmovdqa64 %zmm25, (%r8)
12175 ; AVX512-NEXT: vmovdqa64 %zmm2, (%r9)
12176 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
12177 ; AVX512-NEXT: vmovdqa64 %zmm3, (%rax)
12178 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
12179 ; AVX512-NEXT: vmovdqa64 %zmm6, (%rax)
12180 ; AVX512-NEXT: vzeroupper
12181 ; AVX512-NEXT: retq
12182 ;
12183 ; AVX512-FCP-LABEL: load_i8_stride7_vf64:
12184 ; AVX512-FCP: # %bb.0:
12185 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
12186 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm19
12187 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm10
12188 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm30
12189 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1
12190 ; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm7
12191 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm10, %ymm19, %ymm1
12192 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
12193 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u]
12194 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
12195 ; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
12196 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
12197 ; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm27
12198 ; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm2
12199 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm30, %ymm27, %ymm2
12200 ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm9
12201 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm9[2],ymm2[3,4],ymm9[5],ymm2[6,7,8,9],ymm9[10],ymm2[11,12],ymm9[13],ymm2[14,15]
12202 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12203 ; AVX512-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4
12204 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
12205 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm31
12206 ; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm29
12207 ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm1
12208 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm31, %ymm29, %ymm1
12209 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
12210 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u]
12211 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
12212 ; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
12213 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
12214 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6]
12215 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18
12216 ; AVX512-FCP-NEXT: vpermd %ymm18, %ymm2, %ymm2
12217 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
12218 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
12219 ; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm3
12220 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u,u,u,u,u]
12221 ; AVX512-FCP-NEXT: vmovdqa %xmm3, %xmm8
12222 ; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %xmm0
12223 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
12224 ; AVX512-FCP-NEXT: vpor %xmm2, %xmm5, %xmm2
12225 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm20
12226 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
12227 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm4, %zmm2, %zmm20
12228 ; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm11
12229 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %ymm26
12230 ; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm4
12231 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm11, %ymm26, %ymm4
12232 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u]
12233 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
12234 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u]
12235 ; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm13
12236 ; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm15
12237 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm16
12238 ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm5
12239 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm15, %ymm16, %ymm5
12240 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm5[2,3,0,1]
12241 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm14[2],ymm5[3,4,5],ymm14[6],ymm5[7,8,9],ymm14[10],ymm5[11,12,13],ymm14[14],ymm5[15]
12242 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
12243 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
12244 ; AVX512-FCP-NEXT: vpternlogq $248, %ymm21, %ymm13, %ymm5
12245 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
12246 ; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm13
12247 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm10, %ymm19, %ymm13
12248 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm3
12249 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u,u,u]
12250 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u]
12251 ; AVX512-FCP-NEXT: vpor %xmm3, %xmm13, %xmm3
12252 ; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm1
12253 ; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm13
12254 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm27, %ymm30, %ymm13
12255 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm9[2],ymm13[3,4,5],ymm9[6],ymm13[7,8,9],ymm9[10],ymm13[11,12,13],ymm9[14],ymm13[15]
12256 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
12257 ; AVX512-FCP-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm13
12258 ; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm3
12259 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm31, %ymm29, %ymm3
12260 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u]
12261 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
12262 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u]
12263 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
12264 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
12265 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,4,6]
12266 ; AVX512-FCP-NEXT: vpermd %ymm18, %ymm7, %ymm7
12267 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
12268 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7]
12269 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u]
12270 ; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm23
12271 ; AVX512-FCP-NEXT: vmovdqa %xmm0, %xmm4
12272 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
12273 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
12274 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm22
12275 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm13, %zmm2, %zmm22
12276 ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm2
12277 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm10, %ymm19, %ymm2
12278 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u]
12279 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
12280 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u]
12281 ; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
12282 ; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm3
12283 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm27, %ymm30, %ymm3
12284 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3],ymm3[4,5],ymm9[6],ymm3[7,8,9,10],ymm9[11],ymm3[12,13],ymm9[14],ymm3[15]
12285 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12286 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
12287 ; AVX512-FCP-NEXT: vpternlogq $248, %ymm17, %ymm2, %ymm3
12288 ; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm2
12289 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm29, %ymm31, %ymm2
12290 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7
12291 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u]
12292 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u]
12293 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
12294 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
12295 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,5,6]
12296 ; AVX512-FCP-NEXT: vpermd %ymm18, %ymm7, %ymm7
12297 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
12298 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7]
12299 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
12300 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12301 ; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm1
12302 ; AVX512-FCP-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12303 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
12304 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
12305 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm23
12306 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm18 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
12307 ; AVX512-FCP-NEXT: vpternlogq $226, %zmm3, %zmm18, %zmm23
12308 ; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm2
12309 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm10, %ymm19, %ymm2
12310 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
12311 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
12312 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
12313 ; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
12314 ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm3
12315 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm27, %ymm30, %ymm3
12316 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm3[1,2],ymm9[3],ymm3[4,5,6],ymm9[7,8],ymm3[9,10],ymm9[11],ymm3[12,13,14],ymm9[15]
12317 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12318 ; AVX512-FCP-NEXT: vpternlogq $248, %ymm17, %ymm2, %ymm3
12319 ; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm2
12320 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm29, %ymm31, %ymm2
12321 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7
12322 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u]
12323 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u]
12324 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
12325 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7
12326 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
12327 ; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm0
12328 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12329 ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm8
12330 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm0
12331 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12332 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
12333 ; AVX512-FCP-NEXT: vpor %xmm8, %xmm13, %xmm8
12334 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
12335 ; AVX512-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm8
12336 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
12337 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
12338 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7
12339 ; AVX512-FCP-NEXT: vmovdqa64 416(%rdi), %ymm24
12340 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm8, %zmm28
12341 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %ymm25
12342 ; AVX512-FCP-NEXT: vpternlogq $226, %zmm3, %zmm18, %zmm28
12343 ; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm3
12344 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm24, %ymm25, %ymm3
12345 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7
12346 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9]
12347 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero
12348 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
12349 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
12350 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
12351 ; AVX512-FCP-NEXT: vpternlogq $184, %ymm5, %ymm0, %ymm3
12352 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm18
12353 ; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} zmm5 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
12354 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm20, %zmm5, %zmm18
12355 ; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm3
12356 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm26, %ymm11, %ymm3
12357 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7
12358 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u]
12359 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u]
12360 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
12361 ; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm7
12362 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm15, %ymm16, %ymm7
12363 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
12364 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15]
12365 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
12366 ; AVX512-FCP-NEXT: vpternlogq $248, %ymm21, %ymm3, %ymm7
12367 ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm3
12368 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm24, %ymm25, %ymm3
12369 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero
12370 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
12371 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10]
12372 ; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3
12373 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
12374 ; AVX512-FCP-NEXT: vpternlogq $184, %ymm7, %ymm0, %ymm3
12375 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm20
12376 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm22, %zmm5, %zmm20
12377 ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm3
12378 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm26, %ymm11, %ymm3
12379 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7
12380 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u]
12381 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u]
12382 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
12383 ; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm7
12384 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm16, %ymm15, %ymm7
12385 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
12386 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2],ymm8[3],ymm7[4,5,6],ymm8[7,8],ymm7[9,10],ymm8[11],ymm7[12,13,14],ymm8[15]
12387 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
12388 ; AVX512-FCP-NEXT: vpternlogq $248, %ymm21, %ymm3, %ymm7
12389 ; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm3
12390 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm24, %ymm25, %ymm3
12391 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero
12392 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
12393 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11]
12394 ; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3
12395 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
12396 ; AVX512-FCP-NEXT: vpternlogq $184, %ymm7, %ymm0, %ymm3
12397 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm22
12398 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm23, %zmm5, %zmm22
12399 ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm3
12400 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm16, %ymm15, %ymm3
12401 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
12402 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3],ymm7[4],ymm3[5,6],ymm7[7,8],ymm3[9,10,11],ymm7[12],ymm3[13,14],ymm7[15]
12403 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
12404 ; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm3
12405 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm26, %ymm11, %ymm3
12406 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u]
12407 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
12408 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u]
12409 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
12410 ; AVX512-FCP-NEXT: vpternlogq $236, %ymm21, %ymm2, %ymm3
12411 ; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm2
12412 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm25, %ymm24, %ymm2
12413 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7
12414 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12]
12415 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero
12416 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
12417 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
12418 ; AVX512-FCP-NEXT: vpternlogq $184, %ymm3, %ymm0, %ymm2
12419 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm23
12420 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm28, %zmm5, %zmm23
12421 ; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm2
12422 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm16, %ymm15, %ymm2
12423 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
12424 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15]
12425 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
12426 ; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3
12427 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm11, %ymm26, %ymm3
12428 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
12429 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u]
12430 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u]
12431 ; AVX512-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
12432 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
12433 ; AVX512-FCP-NEXT: vpternlogq $236, %ymm21, %ymm2, %ymm3
12434 ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm2
12435 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm25, %ymm24, %ymm2
12436 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5
12437 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13]
12438 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero
12439 ; AVX512-FCP-NEXT: vpor %xmm5, %xmm2, %xmm2
12440 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4
12441 ; AVX512-FCP-NEXT: vpternlogq $184, %ymm3, %ymm0, %ymm4
12442 ; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm2
12443 ; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm21
12444 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm15, %ymm16, %ymm2
12445 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
12446 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15]
12447 ; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm3
12448 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm11, %ymm26, %ymm3
12449 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
12450 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u]
12451 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u]
12452 ; AVX512-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
12453 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
12454 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
12455 ; AVX512-FCP-NEXT: vpternlogq $220, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
12456 ; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm2
12457 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm25, %ymm24, %ymm2
12458 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero
12459 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
12460 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14]
12461 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
12462 ; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm28
12463 ; AVX512-FCP-NEXT: vpternlogq $184, %ymm3, %ymm0, %ymm28
12464 ; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm2
12465 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm19, %ymm10, %ymm2
12466 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
12467 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u]
12468 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u]
12469 ; AVX512-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
12470 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm24, %ymm25, %ymm21
12471 ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm3
12472 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm19, %ymm10, %ymm3
12473 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7
12474 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u]
12475 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u]
12476 ; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
12477 ; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm7
12478 ; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm13
12479 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm19, %ymm10, %ymm6
12480 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm27, %ymm30, %ymm7
12481 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u]
12482 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
12483 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u]
12484 ; AVX512-FCP-NEXT: vpor %xmm6, %xmm8, %xmm8
12485 ; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm10
12486 ; AVX512-FCP-NEXT: vpternlogq $226, %ymm11, %ymm12, %ymm26
12487 ; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm6
12488 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm30, %ymm27, %ymm6
12489 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm30, %ymm27, %ymm12
12490 ; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm0
12491 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1,2,3],ymm9[4],ymm7[5,6],ymm9[7,8],ymm7[9,10,11],ymm9[12],ymm7[13,14],ymm9[15]
12492 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12493 ; AVX512-FCP-NEXT: vpternlogq $248, %ymm17, %ymm2, %ymm9
12494 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm0[1],ymm6[2,3],ymm0[4],ymm6[5,6,7,8],ymm0[9],ymm6[10,11],ymm0[12],ymm6[13,14,15]
12495 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12496 ; AVX512-FCP-NEXT: vpternlogq $248, %ymm17, %ymm3, %ymm6
12497 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7,8],ymm0[9],ymm12[10,11,12],ymm0[13],ymm12[14,15]
12498 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12499 ; AVX512-FCP-NEXT: vpternlogq $248, %ymm17, %ymm8, %ymm7
12500 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm29, %ymm31, %ymm10
12501 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u]
12502 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm3
12503 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u]
12504 ; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
12505 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12506 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
12507 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm3
12508 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
12509 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
12510 ; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3
12511 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
12512 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
12513 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
12514 ; AVX512-FCP-NEXT: vpternlogq $184, %ymm2, %ymm11, %ymm3
12515 ; AVX512-FCP-NEXT: vpternlogq $226, %ymm15, %ymm14, %ymm16
12516 ; AVX512-FCP-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
12517 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm29, %ymm31, %ymm13
12518 ; AVX512-FCP-NEXT: vpternlogq $202, %ymm31, %ymm29, %ymm14
12519 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12520 ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm2
12521 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
12522 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12523 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3]
12524 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
12525 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
12526 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm9, %zmm3, %zmm2
12527 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
12528 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm10
12529 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u,u,u,u]
12530 ; AVX512-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9
12531 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm5
12532 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
12533 ; AVX512-FCP-NEXT: vpor %xmm5, %xmm10, %xmm5
12534 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
12535 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
12536 ; AVX512-FCP-NEXT: vpternlogq $184, %ymm9, %ymm11, %ymm5
12537 ; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm13
12538 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm9
12539 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u]
12540 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u]
12541 ; AVX512-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9
12542 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
12543 ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm11
12544 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
12545 ; AVX512-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
12546 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
12547 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
12548 ; AVX512-FCP-NEXT: vpternlogq $184, %ymm9, %ymm13, %ymm11
12549 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12550 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12551 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
12552 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm5, %zmm5
12553 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm6, %zmm3, %zmm5
12554 ; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm15, %xmm6
12555 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12556 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
12557 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm11, %zmm6
12558 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm7, %zmm3, %zmm6
12559 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm3
12560 ; AVX512-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
12561 ; AVX512-FCP-NEXT: kmovw %eax, %k1
12562 ; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm2 {%k1}
12563 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm3
12564 ; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm5 {%k1}
12565 ; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0
12566 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm16[2,3,0,1]
12567 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
12568 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3
12569 ; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm0
12570 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u]
12571 ; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm26, %xmm1
12572 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u]
12573 ; AVX512-FCP-NEXT: vpor %xmm4, %xmm1, %xmm1
12574 ; AVX512-FCP-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm1
12575 ; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm0
12576 ; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm3
12577 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15]
12578 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero
12579 ; AVX512-FCP-NEXT: vpor %xmm3, %xmm0, %xmm0
12580 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12581 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
12582 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12583 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
12584 ; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1}
12585 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rsi)
12586 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%rdx)
12587 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, (%rcx)
12588 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%r8)
12589 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%r9)
12590 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
12591 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
12592 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
12593 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rax)
12594 ; AVX512-FCP-NEXT: vzeroupper
12595 ; AVX512-FCP-NEXT: retq
12596 ;
12597 ; AVX512DQ-LABEL: load_i8_stride7_vf64:
12598 ; AVX512DQ: # %bb.0:
12599 ; AVX512DQ-NEXT: subq $24, %rsp
12600 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
12601 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm12
12602 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm13
12603 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm31
12604 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
12605 ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm23
12606 ; AVX512DQ-NEXT: vpternlogq $202, %ymm13, %ymm12, %ymm1
12607 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
12608 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u]
12609 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
12610 ; AVX512DQ-NEXT: vpor %xmm2, %xmm1, %xmm1
12611 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
12612 ; AVX512DQ-NEXT: vmovdqa64 96(%rdi), %ymm28
12613 ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2
12614 ; AVX512DQ-NEXT: vpternlogq $202, %ymm31, %ymm28, %ymm2
12615 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm11
12616 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3,4],ymm11[5],ymm2[6,7,8,9],ymm11[10],ymm2[11,12],ymm11[13],ymm2[14,15]
12617 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12618 ; AVX512DQ-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
12619 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
12620 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm0
12621 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm3
12622 ; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm1
12623 ; AVX512DQ-NEXT: vpternlogq $202, %ymm0, %ymm3, %ymm1
12624 ; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm4
12625 ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm25
12626 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3
12627 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u]
12628 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
12629 ; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm1
12630 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
12631 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm0
12632 ; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
12633 ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm0, %xmm3
12634 ; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm30
12635 ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm20
12636 ; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm10
12637 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
12638 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
12639 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
12640 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
12641 ; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm0
12642 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u]
12643 ; AVX512DQ-NEXT: vmovdqa %xmm0, %xmm3
12644 ; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm0
12645 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
12646 ; AVX512DQ-NEXT: vmovdqa %xmm0, %xmm5
12647 ; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6
12648 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm6, %zmm1, %zmm22
12649 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
12650 ; AVX512DQ-NEXT: vpternlogq $184, %zmm2, %zmm19, %zmm22
12651 ; AVX512DQ-NEXT: vmovdqa64 288(%rdi), %ymm18
12652 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %ymm17
12653 ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2
12654 ; AVX512DQ-NEXT: vpternlogq $202, %ymm18, %ymm17, %ymm2
12655 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u]
12656 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2
12657 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u]
12658 ; AVX512DQ-NEXT: vpor %xmm6, %xmm2, %xmm2
12659 ; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm21
12660 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm16
12661 ; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm7
12662 ; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm16, %ymm7
12663 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
12664 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7,8,9],ymm8[10],ymm7[11,12,13],ymm8[14],ymm7[15]
12665 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
12666 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm24 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
12667 ; AVX512DQ-NEXT: vpternlogq $248, %ymm24, %ymm2, %ymm8
12668 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
12669 ; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm2
12670 ; AVX512DQ-NEXT: vpternlogq $202, %ymm13, %ymm12, %ymm2
12671 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm15
12672 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
12673 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u]
12674 ; AVX512DQ-NEXT: vpor %xmm2, %xmm15, %xmm2
12675 ; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm15
12676 ; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm29
12677 ; AVX512DQ-NEXT: vpternlogq $202, %ymm28, %ymm31, %ymm15
12678 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm11[2],ymm15[3,4,5],ymm11[6],ymm15[7,8,9],ymm11[10],ymm15[11,12,13],ymm11[14],ymm15[15]
12679 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
12680 ; AVX512DQ-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm15
12681 ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2
12682 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm6
12683 ; AVX512DQ-NEXT: vpternlogq $202, %ymm25, %ymm4, %ymm2
12684 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u]
12685 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2
12686 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u]
12687 ; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0
12688 ; AVX512DQ-NEXT: vmovdqa %xmm3, %xmm4
12689 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u,u,u,u,u]
12690 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
12691 ; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm26
12692 ; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2
12693 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
12694 ; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm1
12695 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
12696 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
12697 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12698 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
12699 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
12700 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm23
12701 ; AVX512DQ-NEXT: vpternlogq $184, %zmm15, %zmm19, %zmm23
12702 ; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0
12703 ; AVX512DQ-NEXT: vpternlogq $202, %ymm13, %ymm12, %ymm0
12704 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u]
12705 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
12706 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u]
12707 ; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0
12708 ; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm2
12709 ; AVX512DQ-NEXT: vpternlogq $202, %ymm28, %ymm31, %ymm2
12710 ; AVX512DQ-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12711 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm11[3],ymm2[4,5],ymm11[6],ymm2[7,8,9,10],ymm11[11],ymm2[12,13],ymm11[14],ymm2[15]
12712 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12713 ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm3 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
12714 ; AVX512DQ-NEXT: vpternlogq $248, %ymm3, %ymm0, %ymm2
12715 ; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm27
12716 ; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm0
12717 ; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm15
12718 ; AVX512DQ-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12719 ; AVX512DQ-NEXT: vpternlogq $202, %ymm6, %ymm25, %ymm0
12720 ; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm19
12721 ; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12722 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3
12723 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u]
12724 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u]
12725 ; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0
12726 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12727 ; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm3
12728 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm10, %xmm3
12729 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
12730 ; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm6
12731 ; AVX512DQ-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12732 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
12733 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
12734 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
12735 ; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm1
12736 ; AVX512DQ-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12737 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
12738 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
12739 ; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12740 ; AVX512DQ-NEXT: vpor %xmm3, %xmm5, %xmm3
12741 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm25
12742 ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm20 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
12743 ; AVX512DQ-NEXT: vpternlogq $226, %zmm2, %zmm20, %zmm25
12744 ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm0
12745 ; AVX512DQ-NEXT: vpternlogq $202, %ymm13, %ymm12, %ymm0
12746 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u]
12747 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
12748 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u]
12749 ; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0
12750 ; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm2
12751 ; AVX512DQ-NEXT: vpternlogq $202, %ymm28, %ymm31, %ymm2
12752 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm2[1,2],ymm11[3],ymm2[4,5,6],ymm11[7,8],ymm2[9,10],ymm11[11],ymm2[12,13,14],ymm11[15]
12753 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12754 ; AVX512DQ-NEXT: vpternlogq $248, %ymm27, %ymm0, %ymm3
12755 ; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm11
12756 ; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0
12757 ; AVX512DQ-NEXT: vpternlogq $202, %ymm19, %ymm15, %ymm0
12758 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2
12759 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[2,9,u,u,u,u,u]
12760 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u]
12761 ; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0
12762 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12763 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
12764 ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm10, %xmm5
12765 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
12766 ; AVX512DQ-NEXT: vpor %xmm5, %xmm15, %xmm5
12767 ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
12768 ; AVX512DQ-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm5
12769 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
12770 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
12771 ; AVX512DQ-NEXT: vpor %xmm0, %xmm15, %xmm0
12772 ; AVX512DQ-NEXT: vmovdqa64 416(%rdi), %ymm26
12773 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm30
12774 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %ymm27
12775 ; AVX512DQ-NEXT: vpternlogq $226, %zmm3, %zmm20, %zmm30
12776 ; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0
12777 ; AVX512DQ-NEXT: vpternlogq $202, %ymm26, %ymm27, %ymm0
12778 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3
12779 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9]
12780 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero
12781 ; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0
12782 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12783 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm29 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
12784 ; AVX512DQ-NEXT: vpternlogq $184, %ymm8, %ymm29, %ymm0
12785 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20
12786 ; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} zmm8 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
12787 ; AVX512DQ-NEXT: vpternlogq $184, %zmm22, %zmm8, %zmm20
12788 ; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0
12789 ; AVX512DQ-NEXT: vpternlogq $202, %ymm17, %ymm18, %ymm0
12790 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3
12791 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u]
12792 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u]
12793 ; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0
12794 ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm3
12795 ; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm16, %ymm3
12796 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1]
12797 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15]
12798 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
12799 ; AVX512DQ-NEXT: vpternlogq $248, %ymm24, %ymm0, %ymm3
12800 ; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0
12801 ; AVX512DQ-NEXT: vpternlogq $202, %ymm26, %ymm27, %ymm0
12802 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero
12803 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
12804 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10]
12805 ; AVX512DQ-NEXT: vpor %xmm5, %xmm0, %xmm0
12806 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12807 ; AVX512DQ-NEXT: vpternlogq $184, %ymm3, %ymm29, %ymm0
12808 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm22
12809 ; AVX512DQ-NEXT: vpternlogq $184, %zmm23, %zmm8, %zmm22
12810 ; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0
12811 ; AVX512DQ-NEXT: vpternlogq $202, %ymm17, %ymm18, %ymm0
12812 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3
12813 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u]
12814 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u]
12815 ; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0
12816 ; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm3
12817 ; AVX512DQ-NEXT: vpternlogq $202, %ymm16, %ymm21, %ymm3
12818 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1]
12819 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1,2],ymm5[3],ymm3[4,5,6],ymm5[7,8],ymm3[9,10],ymm5[11],ymm3[12,13,14],ymm5[15]
12820 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
12821 ; AVX512DQ-NEXT: vpternlogq $248, %ymm24, %ymm0, %ymm3
12822 ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm0
12823 ; AVX512DQ-NEXT: vpternlogq $202, %ymm26, %ymm27, %ymm0
12824 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero
12825 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
12826 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11]
12827 ; AVX512DQ-NEXT: vpor %xmm5, %xmm0, %xmm0
12828 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12829 ; AVX512DQ-NEXT: vpternlogq $184, %ymm3, %ymm29, %ymm0
12830 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm23
12831 ; AVX512DQ-NEXT: vpternlogq $184, %zmm25, %zmm8, %zmm23
12832 ; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0
12833 ; AVX512DQ-NEXT: vpternlogq $202, %ymm16, %ymm21, %ymm0
12834 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
12835 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6],ymm3[7,8],ymm0[9,10,11],ymm3[12],ymm0[13,14],ymm3[15]
12836 ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0
12837 ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2
12838 ; AVX512DQ-NEXT: vpternlogq $202, %ymm17, %ymm18, %ymm2
12839 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u]
12840 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2
12841 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u]
12842 ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
12843 ; AVX512DQ-NEXT: vpternlogq $236, %ymm24, %ymm0, %ymm2
12844 ; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0
12845 ; AVX512DQ-NEXT: vpternlogq $202, %ymm27, %ymm26, %ymm0
12846 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3
12847 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12]
12848 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero
12849 ; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0
12850 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12851 ; AVX512DQ-NEXT: vpternlogq $184, %ymm2, %ymm29, %ymm0
12852 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25
12853 ; AVX512DQ-NEXT: vpternlogq $184, %zmm30, %zmm8, %zmm25
12854 ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm0
12855 ; AVX512DQ-NEXT: vpternlogq $202, %ymm16, %ymm21, %ymm0
12856 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
12857 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15]
12858 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
12859 ; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm2
12860 ; AVX512DQ-NEXT: vpternlogq $202, %ymm18, %ymm17, %ymm2
12861 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
12862 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u]
12863 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u]
12864 ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
12865 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
12866 ; AVX512DQ-NEXT: vpternlogq $236, %ymm24, %ymm0, %ymm2
12867 ; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0
12868 ; AVX512DQ-NEXT: vpternlogq $202, %ymm27, %ymm26, %ymm0
12869 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3
12870 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13]
12871 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero
12872 ; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0
12873 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm24
12874 ; AVX512DQ-NEXT: vpternlogq $184, %ymm2, %ymm29, %ymm24
12875 ; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm0
12876 ; AVX512DQ-NEXT: vpternlogq $202, %ymm21, %ymm16, %ymm0
12877 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
12878 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
12879 ; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm2
12880 ; AVX512DQ-NEXT: vpternlogq $202, %ymm18, %ymm17, %ymm2
12881 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
12882 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u]
12883 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u]
12884 ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
12885 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
12886 ; AVX512DQ-NEXT: vpshufb %ymm8, %ymm0, %ymm0
12887 ; AVX512DQ-NEXT: vpternlogq $220, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
12888 ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm0
12889 ; AVX512DQ-NEXT: vpternlogq $202, %ymm27, %ymm26, %ymm0
12890 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero
12891 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
12892 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14]
12893 ; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0
12894 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm30
12895 ; AVX512DQ-NEXT: vpternlogq $184, %ymm2, %ymm29, %ymm30
12896 ; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0
12897 ; AVX512DQ-NEXT: vpternlogq $202, %ymm12, %ymm13, %ymm0
12898 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2
12899 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
12900 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u]
12901 ; AVX512DQ-NEXT: vporq %xmm2, %xmm0, %xmm29
12902 ; AVX512DQ-NEXT: vpternlogq $202, %ymm26, %ymm27, %ymm19
12903 ; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0
12904 ; AVX512DQ-NEXT: vpternlogq $202, %ymm12, %ymm13, %ymm0
12905 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2
12906 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u,u,u,u]
12907 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u]
12908 ; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0
12909 ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2
12910 ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm15
12911 ; AVX512DQ-NEXT: vpternlogq $202, %ymm12, %ymm13, %ymm9
12912 ; AVX512DQ-NEXT: vpternlogq $202, %ymm28, %ymm31, %ymm2
12913 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u]
12914 ; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm5
12915 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u]
12916 ; AVX512DQ-NEXT: vpor %xmm3, %xmm5, %xmm3
12917 ; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm5
12918 ; AVX512DQ-NEXT: vpternlogq $226, %ymm18, %ymm14, %ymm17
12919 ; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm9
12920 ; AVX512DQ-NEXT: vpternlogq $202, %ymm31, %ymm28, %ymm9
12921 ; AVX512DQ-NEXT: vpternlogq $202, %ymm31, %ymm28, %ymm14
12922 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12923 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3],ymm4[4],ymm2[5,6],ymm4[7,8],ymm2[9,10,11],ymm4[12],ymm2[13,14],ymm4[15]
12924 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12925 ; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm1
12926 ; AVX512DQ-NEXT: vpternlogq $248, %ymm11, %ymm29, %ymm12
12927 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm4[1],ymm9[2,3],ymm4[4],ymm9[5,6,7,8],ymm4[9],ymm9[10,11],ymm4[12],ymm9[13,14,15]
12928 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12929 ; AVX512DQ-NEXT: vpternlogq $248, %ymm1, %ymm0, %ymm11
12930 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm4[1],ymm14[2,3,4],ymm4[5],ymm14[6,7,8],ymm4[9],ymm14[10,11,12],ymm4[13],ymm14[14,15]
12931 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
12932 ; AVX512DQ-NEXT: vpternlogq $248, %ymm1, %ymm3, %ymm9
12933 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
12934 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12935 ; AVX512DQ-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm5
12936 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u,u,u]
12937 ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm2
12938 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u]
12939 ; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0
12940 ; AVX512DQ-NEXT: vmovdqa %xmm10, %xmm1
12941 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
12942 ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm10, %xmm2
12943 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
12944 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
12945 ; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2
12946 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12947 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
12948 ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm18 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
12949 ; AVX512DQ-NEXT: vpternlogq $184, %ymm0, %ymm18, %ymm2
12950 ; AVX512DQ-NEXT: vpternlogq $226, %ymm21, %ymm7, %ymm16
12951 ; AVX512DQ-NEXT: vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
12952 ; AVX512DQ-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm15
12953 ; AVX512DQ-NEXT: vpternlogq $202, %ymm13, %ymm6, %ymm7
12954 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
12955 ; AVX512DQ-NEXT: vpshufb %xmm10, %xmm14, %xmm0
12956 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
12957 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12958 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
12959 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2
12960 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
12961 ; AVX512DQ-NEXT: vpternlogq $184, %zmm12, %zmm0, %zmm2
12962 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u]
12963 ; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm5
12964 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u,u,u,u]
12965 ; AVX512DQ-NEXT: vpor %xmm3, %xmm5, %xmm3
12966 ; AVX512DQ-NEXT: vpshufb %xmm8, %xmm1, %xmm5
12967 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
12968 ; AVX512DQ-NEXT: vpor %xmm5, %xmm8, %xmm5
12969 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
12970 ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
12971 ; AVX512DQ-NEXT: vpternlogq $184, %ymm3, %ymm18, %ymm5
12972 ; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm3
12973 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u]
12974 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u]
12975 ; AVX512DQ-NEXT: vpor %xmm3, %xmm7, %xmm3
12976 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
12977 ; AVX512DQ-NEXT: vpshufb %xmm7, %xmm1, %xmm8
12978 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
12979 ; AVX512DQ-NEXT: vpor %xmm12, %xmm8, %xmm8
12980 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
12981 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
12982 ; AVX512DQ-NEXT: vpternlogq $184, %ymm3, %ymm18, %ymm8
12983 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12984 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12985 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3]
12986 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3
12987 ; AVX512DQ-NEXT: vpternlogq $184, %zmm11, %zmm0, %zmm3
12988 ; AVX512DQ-NEXT: vpshufb %xmm10, %xmm13, %xmm5
12989 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
12990 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
12991 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5
12992 ; AVX512DQ-NEXT: vpternlogq $184, %zmm9, %zmm0, %zmm5
12993 ; AVX512DQ-NEXT: movw $-512, %ax # imm = 0xFE00
12994 ; AVX512DQ-NEXT: kmovw %eax, %k1
12995 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm24, %zmm0, %zmm2 {%k1}
12996 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm30, %zmm0, %zmm3 {%k1}
12997 ; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm1
12998 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm16[2,3,0,1]
12999 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
13000 ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm0, %ymm0
13001 ; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm1
13002 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u]
13003 ; AVX512DQ-NEXT: vextracti32x4 $1, %ymm17, %xmm1
13004 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u]
13005 ; AVX512DQ-NEXT: vpor %xmm6, %xmm1, %xmm1
13006 ; AVX512DQ-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
13007 ; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm4
13008 ; AVX512DQ-NEXT: vextracti32x4 $1, %ymm19, %xmm0
13009 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
13010 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero
13011 ; AVX512DQ-NEXT: vpor %xmm0, %xmm4, %xmm0
13012 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13013 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
13014 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13015 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm5 {%k1}
13016 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rsi)
13017 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, (%rdx)
13018 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%rcx)
13019 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, (%r8)
13020 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%r9)
13021 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
13022 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax)
13023 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
13024 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rax)
13025 ; AVX512DQ-NEXT: addq $24, %rsp
13026 ; AVX512DQ-NEXT: vzeroupper
13027 ; AVX512DQ-NEXT: retq
13028 ;
13029 ; AVX512DQ-FCP-LABEL: load_i8_stride7_vf64:
13030 ; AVX512DQ-FCP: # %bb.0:
13031 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
13032 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm26
13033 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm11
13034 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm30
13035 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1
13036 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm7
13037 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm11, %ymm26, %ymm1
13038 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
13039 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u]
13040 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
13041 ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
13042 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
13043 ; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm29
13044 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2
13045 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm30, %ymm29, %ymm2
13046 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm9
13047 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm9[2],ymm2[3,4],ymm9[5],ymm2[6,7,8,9],ymm9[10],ymm2[11,12],ymm9[13],ymm2[14,15]
13048 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
13049 ; AVX512DQ-FCP-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm4
13050 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
13051 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
13052 ; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm31
13053 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm1
13054 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm0, %ymm31, %ymm1
13055 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm24
13056 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
13057 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u]
13058 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
13059 ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
13060 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
13061 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6]
13062 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18
13063 ; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm2, %ymm2
13064 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
13065 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
13066 ; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm3
13067 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u,u,u,u,u]
13068 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, %xmm8
13069 ; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm0
13070 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
13071 ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm5, %xmm2
13072 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm20
13073 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
13074 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm4, %zmm2, %zmm20
13075 ; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm10
13076 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %ymm19
13077 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm4
13078 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm10, %ymm19, %ymm4
13079 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u]
13080 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
13081 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u]
13082 ; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm13
13083 ; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm15
13084 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm16
13085 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm5
13086 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm15, %ymm16, %ymm5
13087 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm5[2,3,0,1]
13088 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm14[2],ymm5[3,4,5],ymm14[6],ymm5[7,8,9],ymm14[10],ymm5[11,12,13],ymm14[14],ymm5[15]
13089 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
13090 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
13091 ; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm23, %ymm13, %ymm5
13092 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
13093 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm13
13094 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm11, %ymm26, %ymm13
13095 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm3
13096 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u,u,u]
13097 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u]
13098 ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm13, %xmm3
13099 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm1
13100 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm13
13101 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm29, %ymm30, %ymm13
13102 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm9[2],ymm13[3,4,5],ymm9[6],ymm13[7,8,9],ymm9[10],ymm13[11,12,13],ymm9[14],ymm13[15]
13103 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
13104 ; AVX512DQ-FCP-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm13
13105 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3
13106 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm24, %ymm31, %ymm3
13107 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u]
13108 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
13109 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u]
13110 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
13111 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
13112 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,4,6]
13113 ; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm7, %ymm7
13114 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
13115 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7]
13116 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u]
13117 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm22
13118 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, %xmm4
13119 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
13120 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
13121 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm21
13122 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm13, %zmm2, %zmm21
13123 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm2
13124 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm11, %ymm26, %ymm2
13125 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u]
13126 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
13127 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u]
13128 ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
13129 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm3
13130 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm29, %ymm30, %ymm3
13131 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3],ymm3[4,5],ymm9[6],ymm3[7,8,9,10],ymm9[11],ymm3[12,13],ymm9[14],ymm3[15]
13132 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
13133 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
13134 ; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm17, %ymm2, %ymm3
13135 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm2
13136 ; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13137 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm31, %ymm24, %ymm2
13138 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7
13139 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u]
13140 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u]
13141 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
13142 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
13143 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,1,3,5,6]
13144 ; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm7, %ymm7
13145 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
13146 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7]
13147 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u]
13148 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13149 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm1
13150 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13151 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
13152 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
13153 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm22
13154 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm18 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
13155 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm3, %zmm18, %zmm22
13156 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2
13157 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm11, %ymm26, %ymm2
13158 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
13159 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
13160 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
13161 ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
13162 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm3
13163 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm29, %ymm30, %ymm3
13164 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm3[1,2],ymm9[3],ymm3[4,5,6],ymm9[7,8],ymm3[9,10],ymm9[11],ymm3[12,13,14],ymm9[15]
13165 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
13166 ; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm17, %ymm2, %ymm3
13167 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm2
13168 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm31, %ymm24, %ymm2
13169 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7
13170 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u]
13171 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u]
13172 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
13173 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7
13174 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
13175 ; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm0
13176 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13177 ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm8
13178 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm0
13179 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13180 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
13181 ; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm13, %xmm8
13182 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
13183 ; AVX512DQ-FCP-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm8
13184 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
13185 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm1[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
13186 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7
13187 ; AVX512DQ-FCP-NEXT: vmovdqa64 416(%rdi), %ymm24
13188 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm8, %zmm28
13189 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %ymm25
13190 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm3, %zmm18, %zmm28
13191 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm3
13192 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm24, %ymm25, %ymm3
13193 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7
13194 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9]
13195 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero
13196 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
13197 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
13198 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm27 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
13199 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm5, %ymm27, %ymm3
13200 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm18
13201 ; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} zmm5 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
13202 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm20, %zmm5, %zmm18
13203 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm3
13204 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm19, %ymm10, %ymm3
13205 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7
13206 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u]
13207 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u]
13208 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
13209 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm7
13210 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm15, %ymm16, %ymm7
13211 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
13212 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15]
13213 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
13214 ; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm23, %ymm3, %ymm7
13215 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm3
13216 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm24, %ymm25, %ymm3
13217 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero
13218 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
13219 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10]
13220 ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3
13221 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
13222 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm7, %ymm27, %ymm3
13223 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm20
13224 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm21, %zmm5, %zmm20
13225 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm3
13226 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm19, %ymm10, %ymm3
13227 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7
13228 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u]
13229 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u]
13230 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
13231 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm7
13232 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm16, %ymm15, %ymm7
13233 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
13234 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2],ymm8[3],ymm7[4,5,6],ymm8[7,8],ymm7[9,10],ymm8[11],ymm7[12,13,14],ymm8[15]
13235 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
13236 ; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm23, %ymm3, %ymm7
13237 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3
13238 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm24, %ymm25, %ymm3
13239 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero
13240 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
13241 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11]
13242 ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3
13243 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
13244 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm7, %ymm27, %ymm3
13245 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm21
13246 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm22, %zmm5, %zmm21
13247 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm3
13248 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm16, %ymm15, %ymm3
13249 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
13250 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3],ymm7[4],ymm3[5,6],ymm7[7,8],ymm3[9,10,11],ymm7[12],ymm3[13,14],ymm7[15]
13251 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
13252 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3
13253 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm19, %ymm10, %ymm3
13254 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u]
13255 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
13256 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u]
13257 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
13258 ; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm23, %ymm2, %ymm3
13259 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm2
13260 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm25, %ymm24, %ymm2
13261 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7
13262 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12]
13263 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero
13264 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
13265 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
13266 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm3, %ymm27, %ymm2
13267 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm22
13268 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm28, %zmm5, %zmm22
13269 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2
13270 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm16, %ymm15, %ymm2
13271 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
13272 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15]
13273 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
13274 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3
13275 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm10, %ymm19, %ymm3
13276 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
13277 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u]
13278 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u]
13279 ; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
13280 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
13281 ; AVX512DQ-FCP-NEXT: vpternlogq $236, %ymm23, %ymm2, %ymm3
13282 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm2
13283 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm25, %ymm24, %ymm2
13284 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5
13285 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13]
13286 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero
13287 ; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm2, %xmm2
13288 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1
13289 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm3, %ymm27, %ymm1
13290 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2
13291 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm23
13292 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm15, %ymm16, %ymm2
13293 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
13294 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15]
13295 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm3
13296 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm10, %ymm19, %ymm3
13297 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
13298 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u]
13299 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u]
13300 ; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
13301 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
13302 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
13303 ; AVX512DQ-FCP-NEXT: vpternlogq $220, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
13304 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2
13305 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm25, %ymm24, %ymm2
13306 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero
13307 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
13308 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14]
13309 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
13310 ; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm28
13311 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm3, %ymm27, %ymm28
13312 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm2
13313 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm26, %ymm11, %ymm2
13314 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
13315 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u]
13316 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u]
13317 ; AVX512DQ-FCP-NEXT: vporq %xmm3, %xmm2, %xmm27
13318 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm24, %ymm25, %ymm23
13319 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm2
13320 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm26, %ymm11, %ymm2
13321 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
13322 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u]
13323 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u,u,u,u,u,u]
13324 ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
13325 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3
13326 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm13
13327 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm26, %ymm11, %ymm6
13328 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm29, %ymm30, %ymm3
13329 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u]
13330 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
13331 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u]
13332 ; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
13333 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm8
13334 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm10, %ymm12, %ymm19
13335 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm7
13336 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm30, %ymm29, %ymm7
13337 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm30, %ymm29, %ymm12
13338 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm3[1,2,3],ymm9[4],ymm3[5,6],ymm9[7,8],ymm3[9,10,11],ymm9[12],ymm3[13,14],ymm9[15]
13339 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
13340 ; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm17, %ymm27, %ymm3
13341 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7,8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13,14,15]
13342 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm0
13343 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
13344 ; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm17, %ymm2, %ymm9
13345 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7,8],ymm0[9],ymm12[10,11,12],ymm0[13],ymm12[14,15]
13346 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
13347 ; AVX512DQ-FCP-NEXT: vpternlogq $248, %ymm17, %ymm6, %ymm7
13348 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13349 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm31, %ymm0, %ymm8
13350 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u]
13351 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6
13352 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u]
13353 ; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm6, %xmm2
13354 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
13355 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
13356 ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm6
13357 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
13358 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
13359 ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6
13360 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
13361 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8
13362 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm11 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
13363 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm2, %ymm11, %ymm8
13364 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %ymm15, %ymm14, %ymm16
13365 ; AVX512DQ-FCP-NEXT: vmovd {{.*#+}} xmm6 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
13366 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm31, %ymm0, %ymm13
13367 ; AVX512DQ-FCP-NEXT: vpternlogq $202, %ymm0, %ymm31, %ymm14
13368 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13369 ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm2
13370 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
13371 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13372 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3]
13373 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2
13374 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
13375 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm3, %zmm8, %zmm2
13376 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
13377 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm10
13378 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u,u,u,u]
13379 ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm10, %xmm3
13380 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm5
13381 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
13382 ; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm10, %xmm5
13383 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
13384 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
13385 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm3, %ymm11, %ymm5
13386 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm13
13387 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm3
13388 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u]
13389 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u]
13390 ; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm10, %xmm3
13391 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
13392 ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm11
13393 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
13394 ; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
13395 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
13396 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
13397 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %ymm3, %ymm13, %ymm11
13398 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13399 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13400 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3]
13401 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3
13402 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm9, %zmm8, %zmm3
13403 ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm15, %xmm5
13404 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13405 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
13406 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm11, %zmm5
13407 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm7, %zmm8, %zmm5
13408 ; AVX512DQ-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
13409 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
13410 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
13411 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm28, %zmm0, %zmm3 {%k1}
13412 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm0
13413 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm16[2,3,0,1]
13414 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7,8,9],ymm6[10],ymm0[11,12],ymm6[13],ymm0[14,15]
13415 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4
13416 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
13417 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u]
13418 ; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm1
13419 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u]
13420 ; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1
13421 ; AVX512DQ-FCP-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm1
13422 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
13423 ; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm23, %xmm4
13424 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15]
13425 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero
13426 ; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0
13427 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13428 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
13429 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13430 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm5 {%k1}
13431 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rsi)
13432 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rdx)
13433 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rcx)
13434 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, (%r8)
13435 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%r9)
13436 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
13437 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
13438 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
13439 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
13440 ; AVX512DQ-FCP-NEXT: vzeroupper
13441 ; AVX512DQ-FCP-NEXT: retq
13442 ;
13443 ; AVX512BW-LABEL: load_i8_stride7_vf64:
13444 ; AVX512BW: # %bb.0:
13445 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm25
13446 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0]
13447 ; AVX512BW-NEXT: vpermw %zmm25, %zmm0, %zmm18
13448 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15]
13449 ; AVX512BW-NEXT: vpermw %zmm25, %zmm0, %zmm24
13450 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31]
13451 ; AVX512BW-NEXT: vpermw %zmm25, %zmm0, %zmm9
13452 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31]
13453 ; AVX512BW-NEXT: vpermw %zmm25, %zmm0, %zmm0
13454 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm10
13455 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
13456 ; AVX512BW-NEXT: movw $-28382, %ax # imm = 0x9122
13457 ; AVX512BW-NEXT: kmovd %eax, %k1
13458 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm10, %ymm3 {%k1}
13459 ; AVX512BW-NEXT: kmovq %k1, %k2
13460 ; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
13461 ; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4
13462 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
13463 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u]
13464 ; AVX512BW-NEXT: vporq %xmm4, %xmm3, %xmm16
13465 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
13466 ; AVX512BW-NEXT: movw $992, %ax # imm = 0x3E0
13467 ; AVX512BW-NEXT: kmovd %eax, %k1
13468 ; AVX512BW-NEXT: vmovdqu16 %ymm0, %ymm16 {%k1}
13469 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm11
13470 ; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm6
13471 ; AVX512BW-NEXT: movw $8772, %ax # imm = 0x2244
13472 ; AVX512BW-NEXT: kmovd %eax, %k6
13473 ; AVX512BW-NEXT: vpblendmw %ymm11, %ymm6, %ymm0 {%k6}
13474 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm3
13475 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u]
13476 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u]
13477 ; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
13478 ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13479 ; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm7
13480 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
13481 ; AVX512BW-NEXT: vpshufb %xmm21, %xmm7, %xmm3
13482 ; AVX512BW-NEXT: vmovdqa 208(%rdi), %xmm8
13483 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
13484 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
13485 ; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
13486 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
13487 ; AVX512BW-NEXT: vmovdqa64 240(%rdi), %xmm26
13488 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u]
13489 ; AVX512BW-NEXT: vmovdqa 224(%rdi), %xmm4
13490 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
13491 ; AVX512BW-NEXT: vpor %xmm5, %xmm12, %xmm5
13492 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0
13493 ; AVX512BW-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000
13494 ; AVX512BW-NEXT: kmovq %rax, %k5
13495 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k5}
13496 ; AVX512BW-NEXT: vmovdqa 288(%rdi), %ymm13
13497 ; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm12
13498 ; AVX512BW-NEXT: movw $9288, %ax # imm = 0x2448
13499 ; AVX512BW-NEXT: kmovd %eax, %k3
13500 ; AVX512BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k3}
13501 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u]
13502 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
13503 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u]
13504 ; AVX512BW-NEXT: vporq %xmm5, %xmm0, %xmm19
13505 ; AVX512BW-NEXT: vmovdqa64 352(%rdi), %ymm17
13506 ; AVX512BW-NEXT: vmovdqa 320(%rdi), %ymm0
13507 ; AVX512BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm5 {%k6}
13508 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm5[2,3,0,1]
13509 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4,5],ymm15[6],ymm5[7,8,9],ymm15[10],ymm5[11,12,13],ymm15[14],ymm5[15]
13510 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
13511 ; AVX512BW-NEXT: movw $3968, %ax # imm = 0xF80
13512 ; AVX512BW-NEXT: kmovd %eax, %k7
13513 ; AVX512BW-NEXT: vmovdqu16 %ymm5, %ymm19 {%k7}
13514 ; AVX512BW-NEXT: vmovdqa 416(%rdi), %ymm15
13515 ; AVX512BW-NEXT: vmovdqa 384(%rdi), %ymm5
13516 ; AVX512BW-NEXT: movw $4644, %ax # imm = 0x1224
13517 ; AVX512BW-NEXT: kmovd %eax, %k4
13518 ; AVX512BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm20 {%k4}
13519 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm22
13520 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
13521 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
13522 ; AVX512BW-NEXT: vporq %xmm22, %xmm20, %xmm20
13523 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20
13524 ; AVX512BW-NEXT: movl $-8388608, %eax # imm = 0xFF800000
13525 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm10, %ymm22 {%k4}
13526 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm22, %xmm23
13527 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
13528 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u,u,u,u,u]
13529 ; AVX512BW-NEXT: vporq %xmm23, %xmm22, %xmm22
13530 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
13531 ; AVX512BW-NEXT: movl $511, %edi # imm = 0x1FF
13532 ; AVX512BW-NEXT: kmovd %edi, %k1
13533 ; AVX512BW-NEXT: vmovdqu8 %ymm22, %ymm9 {%k1}
13534 ; AVX512BW-NEXT: vpblendmw %ymm11, %ymm6, %ymm22 {%k3}
13535 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm23 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u]
13536 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm22, %xmm22
13537 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14,u,u,u,u]
13538 ; AVX512BW-NEXT: vporq %xmm23, %xmm22, %xmm22
13539 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm14
13540 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
13541 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm23 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
13542 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3]
13543 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm2
13544 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7]
13545 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u]
13546 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
13547 ; AVX512BW-NEXT: vporq %xmm14, %xmm22, %xmm14
13548 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm14, %zmm2, %zmm2
13549 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k5}
13550 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm10, %ymm2 {%k6}
13551 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u]
13552 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
13553 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u]
13554 ; AVX512BW-NEXT: vpor %xmm2, %xmm14, %xmm2
13555 ; AVX512BW-NEXT: movl $261632, %edi # imm = 0x3FE00
13556 ; AVX512BW-NEXT: kmovd %edi, %k5
13557 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13558 ; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm14 {%k2}
13559 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm14, %xmm22
13560 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[1,8,15,u,u,u,u]
13561 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
13562 ; AVX512BW-NEXT: vporq %xmm22, %xmm14, %xmm14
13563 ; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
13564 ; AVX512BW-NEXT: vpshufb %xmm21, %xmm8, %xmm21
13565 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
13566 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3]
13567 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm3
13568 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7]
13569 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
13570 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
13571 ; AVX512BW-NEXT: vporq %xmm14, %xmm21, %xmm14
13572 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm14, %zmm3, %zmm22
13573 ; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm22 {%k1}
13574 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm10, %ymm2 {%k3}
13575 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
13576 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
13577 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
13578 ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
13579 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13580 ; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k4}
13581 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm3, %xmm18
13582 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u]
13583 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u]
13584 ; AVX512BW-NEXT: vporq %xmm18, %xmm3, %xmm3
13585 ; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
13586 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12]
13587 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
13588 ; AVX512BW-NEXT: vporq %xmm18, %xmm21, %xmm18
13589 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18
13590 ; AVX512BW-NEXT: movl $-134217728, %edi # imm = 0xF8000000
13591 ; AVX512BW-NEXT: kmovd %edi, %k2
13592 ; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
13593 ; AVX512BW-NEXT: vmovdqu8 %ymm18, %ymm3 {%k2}
13594 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
13595 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
13596 ; AVX512BW-NEXT: vporq %xmm18, %xmm21, %xmm18
13597 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm18, %zmm3, %zmm18
13598 ; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm18 {%k1}
13599 ; AVX512BW-NEXT: kmovd %eax, %k2
13600 ; AVX512BW-NEXT: vmovdqu8 %ymm20, %ymm19 {%k2}
13601 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2
13602 ; AVX512BW-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
13603 ; AVX512BW-NEXT: kmovq %rax, %k1
13604 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm16 {%k1}
13605 ; AVX512BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k4}
13606 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
13607 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u]
13608 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u]
13609 ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
13610 ; AVX512BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k3}
13611 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
13612 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8,9,10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15]
13613 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
13614 ; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7}
13615 ; AVX512BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k6}
13616 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero
13617 ; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3
13618 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10]
13619 ; AVX512BW-NEXT: vpor %xmm3, %xmm14, %xmm3
13620 ; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
13621 ; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2}
13622 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
13623 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k1}
13624 ; AVX512BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k6}
13625 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
13626 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u]
13627 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u]
13628 ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
13629 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k4}
13630 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
13631 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5,6],ymm14[7,8],ymm3[9,10],ymm14[11],ymm3[12,13,14],ymm14[15]
13632 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
13633 ; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7}
13634 ; AVX512BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k3}
13635 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero
13636 ; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3
13637 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11]
13638 ; AVX512BW-NEXT: vpor %xmm3, %xmm14, %xmm3
13639 ; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
13640 ; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2}
13641 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
13642 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm22 {%k1}
13643 ; AVX512BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k3}
13644 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u]
13645 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
13646 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u]
13647 ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
13648 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k6}
13649 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
13650 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2,3],ymm14[4],ymm3[5,6],ymm14[7,8],ymm3[9,10,11],ymm14[12],ymm3[13,14],ymm14[15]
13651 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
13652 ; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7}
13653 ; AVX512BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm3 {%k4}
13654 ; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm14
13655 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12]
13656 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero
13657 ; AVX512BW-NEXT: vpor %xmm3, %xmm14, %xmm3
13658 ; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
13659 ; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2}
13660 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
13661 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm18 {%k1}
13662 ; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
13663 ; AVX512BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k1}
13664 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
13665 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u]
13666 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u]
13667 ; AVX512BW-NEXT: vporq %xmm3, %xmm2, %xmm19
13668 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm2 {%k3}
13669 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
13670 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15]
13671 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
13672 ; AVX512BW-NEXT: vmovdqu16 %ymm2, %ymm19 {%k7}
13673 ; AVX512BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k4}
13674 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
13675 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u]
13676 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u]
13677 ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
13678 ; AVX512BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k1}
13679 ; AVX512BW-NEXT: kmovq %k1, %k7
13680 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
13681 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7,8],ymm14[9],ymm3[10,11,12],ymm14[13],ymm3[14,15]
13682 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm20 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
13683 ; AVX512BW-NEXT: movl $8176, %eax # imm = 0x1FF0
13684 ; AVX512BW-NEXT: kmovd %eax, %k1
13685 ; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k1}
13686 ; AVX512BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k6}
13687 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
13688 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13]
13689 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero
13690 ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
13691 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
13692 ; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm19 {%k2}
13693 ; AVX512BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k3}
13694 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero
13695 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
13696 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14]
13697 ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
13698 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
13699 ; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k2}
13700 ; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm2 {%k6}
13701 ; AVX512BW-NEXT: vpblendmw %ymm10, %ymm1, %ymm21 {%k6}
13702 ; AVX512BW-NEXT: vmovdqu16 %ymm17, %ymm0 {%k4}
13703 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
13704 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
13705 ; AVX512BW-NEXT: vmovdqu16 %ymm13, %ymm12 {%k6}
13706 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u]
13707 ; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm12
13708 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u]
13709 ; AVX512BW-NEXT: vpor %xmm0, %xmm12, %xmm0
13710 ; AVX512BW-NEXT: movl $4186112, %eax # imm = 0x3FE000
13711 ; AVX512BW-NEXT: kmovd %eax, %k1
13712 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
13713 ; AVX512BW-NEXT: vmovdqu16 %ymm15, %ymm5 {%k7}
13714 ; AVX512BW-NEXT: vpblendmw %ymm10, %ymm1, %ymm12 {%k4}
13715 ; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k3}
13716 ; AVX512BW-NEXT: vmovdqu16 %ymm10, %ymm1 {%k3}
13717 ; AVX512BW-NEXT: vmovdqu16 %ymm11, %ymm6 {%k4}
13718 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u]
13719 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
13720 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u]
13721 ; AVX512BW-NEXT: vpor %xmm2, %xmm10, %xmm2
13722 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
13723 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13]
13724 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
13725 ; AVX512BW-NEXT: vpor %xmm10, %xmm11, %xmm10
13726 ; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
13727 ; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
13728 ; AVX512BW-NEXT: vmovdqu8 %ymm10, %ymm2 {%k1}
13729 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u]
13730 ; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3
13731 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u]
13732 ; AVX512BW-NEXT: vpor %xmm3, %xmm10, %xmm3
13733 ; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
13734 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
13735 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14]
13736 ; AVX512BW-NEXT: vpor %xmm10, %xmm11, %xmm10
13737 ; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
13738 ; AVX512BW-NEXT: vmovdqu8 %ymm10, %ymm3 {%k1}
13739 ; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm10
13740 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u]
13741 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
13742 ; AVX512BW-NEXT: vpor %xmm6, %xmm10, %xmm6
13743 ; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
13744 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
13745 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15]
13746 ; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7
13747 ; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
13748 ; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1}
13749 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31]
13750 ; AVX512BW-NEXT: vpermw %zmm25, %zmm7, %zmm7
13751 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31]
13752 ; AVX512BW-NEXT: vpermw %zmm25, %zmm8, %zmm8
13753 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0]
13754 ; AVX512BW-NEXT: vpermw %zmm25, %zmm10, %zmm10
13755 ; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm11
13756 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u]
13757 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u,u,u]
13758 ; AVX512BW-NEXT: vpor %xmm11, %xmm12, %xmm11
13759 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm11 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13760 ; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
13761 ; AVX512BW-NEXT: vpshufb %xmm10, %xmm4, %xmm12
13762 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13763 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
13764 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm2
13765 ; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm11 {%k5}
13766 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2
13767 ; AVX512BW-NEXT: movw $-512, %ax # imm = 0xFE00
13768 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm21, %xmm12
13769 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u]
13770 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u]
13771 ; AVX512BW-NEXT: vpor %xmm12, %xmm13, %xmm12
13772 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm12 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13773 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13774 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13775 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
13776 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3
13777 ; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm12 {%k5}
13778 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
13779 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
13780 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
13781 ; AVX512BW-NEXT: vpor %xmm3, %xmm1, %xmm1
13782 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13783 ; AVX512BW-NEXT: vpshufb %xmm10, %xmm26, %xmm3
13784 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13785 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
13786 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3
13787 ; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm1 {%k5}
13788 ; AVX512BW-NEXT: kmovd %eax, %k1
13789 ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm11 {%k1}
13790 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm2
13791 ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm12 {%k1}
13792 ; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm2
13793 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15]
13794 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero
13795 ; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2
13796 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
13797 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15]
13798 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
13799 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
13800 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
13801 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
13802 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi
13803 ; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rsi)
13804 ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx)
13805 ; AVX512BW-NEXT: vmovdqa64 %zmm22, (%rcx)
13806 ; AVX512BW-NEXT: vmovdqa64 %zmm18, (%r8)
13807 ; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r9)
13808 ; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rdi)
13809 ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax)
13810 ; AVX512BW-NEXT: vzeroupper
13811 ; AVX512BW-NEXT: retq
13813 ; AVX512BW-FCP-LABEL: load_i8_stride7_vf64:
13814 ; AVX512BW-FCP: # %bb.0:
13815 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26
13816 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
13817 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23]
13818 ; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm16
13819 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7]
13820 ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm24
13821 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23]
13822 ; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm17
13823 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15]
13824 ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm25
13825 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15]
13826 ; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm18
13827 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
13828 ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm9
13829 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15]
13830 ; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm6
13831 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
13832 ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm4
13833 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm12
13834 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
13835 ; AVX512BW-FCP-NEXT: movw $-28382, %ax # imm = 0x9122
13836 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
13837 ; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm1 {%k1}
13838 ; AVX512BW-FCP-NEXT: kmovq %k1, %k2
13839 ; AVX512BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
13840 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5
13841 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u]
13842 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
13843 ; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm1, %xmm1
13844 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
13845 ; AVX512BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0
13846 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
13847 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1}
13848 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm13
13849 ; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm11
13850 ; AVX512BW-FCP-NEXT: movw $8772, %ax # imm = 0x2244
13851 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
13852 ; AVX512BW-FCP-NEXT: vpblendmw %ymm13, %ymm11, %ymm4 {%k1}
13853 ; AVX512BW-FCP-NEXT: kmovq %k1, %k3
13854 ; AVX512BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
13855 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
13856 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u]
13857 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u]
13858 ; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
13859 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
13860 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,2,4,6]
13861 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19
13862 ; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm5, %ymm5
13863 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
13864 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
13865 ; AVX512BW-FCP-NEXT: vmovdqa 240(%rdi), %xmm7
13866 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u]
13867 ; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %xmm8
13868 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u]
13869 ; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm10, %xmm5
13870 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm4, %zmm4
13871 ; AVX512BW-FCP-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000
13872 ; AVX512BW-FCP-NEXT: kmovq %rax, %k5
13873 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm1 {%k5}
13874 ; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm5
13875 ; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm4
13876 ; AVX512BW-FCP-NEXT: movw $9288, %ax # imm = 0x2448
13877 ; AVX512BW-FCP-NEXT: kmovd %eax, %k6
13878 ; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm10 {%k6}
13879 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm10[u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u]
13880 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10
13881 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u]
13882 ; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm10, %xmm21
13883 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
13884 ; AVX512BW-FCP-NEXT: movw $3968, %ax # imm = 0xF80
13885 ; AVX512BW-FCP-NEXT: kmovd %eax, %k7
13886 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm21 {%k7}
13887 ; AVX512BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm10
13888 ; AVX512BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm6
13889 ; AVX512BW-FCP-NEXT: movw $4644, %ax # imm = 0x1224
13890 ; AVX512BW-FCP-NEXT: kmovd %eax, %k4
13891 ; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm6, %ymm20 {%k4}
13892 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm22
13893 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
13894 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
13895 ; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm20, %xmm20
13896 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm22
13897 ; AVX512BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000
13898 ; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm20 {%k4}
13899 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm23
13900 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
13901 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[1,8,15],zero,zero,xmm20[4,11],zero,zero,xmm20[u,u,u,u,u,u,u]
13902 ; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm20, %xmm20
13903 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
13904 ; AVX512BW-FCP-NEXT: movl $511, %r10d # imm = 0x1FF
13905 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1
13906 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm9 {%k1}
13907 ; AVX512BW-FCP-NEXT: vpblendmw %ymm13, %ymm11, %ymm20 {%k6}
13908 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,6,13],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u]
13909 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20
13910 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u]
13911 ; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm20, %xmm20
13912 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm15
13913 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,3,4,6]
13914 ; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm20
13915 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
13916 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
13917 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u]
13918 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u]
13919 ; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm20, %xmm15
13920 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14
13921 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm9 {%k5}
13922 ; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm14 {%k3}
13923 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u]
13924 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
13925 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u]
13926 ; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm15
13927 ; AVX512BW-FCP-NEXT: movl $261632, %r10d # imm = 0x3FE00
13928 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k5
13929 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13930 ; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm14 {%k2}
13931 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm20
13932 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,zero,xmm20[5,12],zero,zero,xmm20[1,8,15,u,u,u,u]
13933 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
13934 ; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm14, %xmm14
13935 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
13936 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,3,5,6]
13937 ; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm19
13938 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
13939 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7]
13940 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u]
13941 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
13942 ; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm19, %xmm14
13943 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm0, %zmm14
13944 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k1}
13945 ; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm0 {%k6}
13946 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u]
13947 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
13948 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u]
13949 ; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm15, %xmm0
13950 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
13951 ; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm15 {%k4}
13952 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm19
13953 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u]
13954 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u]
13955 ; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm15, %xmm15
13956 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
13957 ; AVX512BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm19
13958 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[5,12]
13959 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm20
13960 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm24 = xmm20[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
13961 ; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm24, %xmm23
13962 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm23
13963 ; AVX512BW-FCP-NEXT: movl $-134217728, %edi # imm = 0xF8000000
13964 ; AVX512BW-FCP-NEXT: kmovd %edi, %k2
13965 ; AVX512BW-FCP-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
13966 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm23, %ymm15 {%k2}
13967 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u]
13968 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm24 = zero,zero,xmm7[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
13969 ; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm24, %xmm23
13970 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm23, %zmm15, %zmm15
13971 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm15 {%k1}
13972 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3
13973 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3}
13974 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0
13975 ; AVX512BW-FCP-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
13976 ; AVX512BW-FCP-NEXT: kmovq %rax, %k2
13977 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2}
13978 ; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k4}
13979 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm21
13980 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u]
13981 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u]
13982 ; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm0, %xmm0
13983 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
13984 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm18, %ymm0 {%k7}
13985 ; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
13986 ; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm6, %ymm18 {%k1}
13987 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero
13988 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18
13989 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10]
13990 ; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm18, %xmm18
13991 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18
13992 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm0 {%k3}
13993 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
13994 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm9 {%k2}
13995 ; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k1}
13996 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18
13997 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13,u,u]
13998 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u]
13999 ; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0
14000 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
14001 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm0 {%k7}
14002 ; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm6, %ymm17 {%k6}
14003 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero
14004 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17
14005 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11]
14006 ; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17
14007 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17
14008 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm0 {%k3}
14009 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
14010 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm14 {%k2}
14011 ; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k6}
14012 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u]
14013 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
14014 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u]
14015 ; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm0, %xmm0
14016 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
14017 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm0 {%k7}
14018 ; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm10, %ymm16 {%k4}
14019 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17
14020 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[5,12]
14021 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero
14022 ; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
14023 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
14024 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm0 {%k3}
14025 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
14026 ; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2}
14027 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31]
14028 ; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm17
14029 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31]
14030 ; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm0
14031 ; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
14032 ; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm16 {%k2}
14033 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm18
14034 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[1,8,15,u,u]
14035 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u]
14036 ; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm16, %xmm16
14037 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
14038 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm0, %ymm16 {%k7}
14039 ; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm0 {%k4}
14040 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18
14041 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u]
14042 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u]
14043 ; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0
14044 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
14045 ; AVX512BW-FCP-NEXT: movl $8176, %eax # imm = 0x1FF0
14046 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
14047 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm17 {%k1}
14048 ; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
14049 ; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm10, %ymm0 {%k1}
14050 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18
14051 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13]
14052 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero
14053 ; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0
14054 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14055 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm16 {%k3}
14056 ; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm10, %ymm0 {%k6}
14057 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero
14058 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
14059 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14]
14060 ; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0
14061 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14062 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm17 {%k3}
14063 ; AVX512BW-FCP-NEXT: vpblendmw %ymm12, %ymm3, %ymm0 {%k4}
14064 ; AVX512BW-FCP-NEXT: vpblendmw %ymm12, %ymm3, %ymm18 {%k1}
14065 ; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm21 {%k6}
14066 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm12, %ymm3 {%k6}
14067 ; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm12 {%k1}
14068 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm13, %ymm11 {%k4}
14069 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u]
14070 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
14071 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u]
14072 ; AVX512BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
14073 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
14074 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13]
14075 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm20[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
14076 ; AVX512BW-FCP-NEXT: vporq %xmm13, %xmm22, %xmm13
14077 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
14078 ; AVX512BW-FCP-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
14079 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm12 {%k3}
14080 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm21[u,u,3,10],zero,zero,zero,xmm21[6,13],zero,zero,xmm21[u,u,u,u,u]
14081 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm21
14082 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u],zero,zero,xmm21[1,8,15],zero,zero,xmm21[4,11,u,u,u,u,u]
14083 ; AVX512BW-FCP-NEXT: vporq %xmm13, %xmm21, %xmm13
14084 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
14085 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm20[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
14086 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[0,7,14]
14087 ; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm22, %xmm21
14088 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21
14089 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm21, %ymm13 {%k3}
14090 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm11, %xmm21
14091 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u]
14092 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u]
14093 ; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm11, %xmm11
14094 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm11, %ymm0, %ymm21
14095 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
14096 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[1,8,15]
14097 ; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm19, %xmm11
14098 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
14099 ; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm21 {%k3}
14100 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
14101 ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm11, %zmm19
14102 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
14103 ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm11, %zmm20
14104 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
14105 ; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm11, %zmm11
14106 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
14107 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
14108 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u]
14109 ; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm0, %xmm2
14110 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14111 ; AVX512BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
14112 ; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm11
14113 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14114 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm22[0],xmm11[1],xmm22[1],xmm11[2],xmm22[2],xmm11[3],xmm22[3]
14115 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11
14116 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm2 {%k5}
14117 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm11
14118 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u]
14119 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u]
14120 ; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11
14121 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14122 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14123 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14124 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm18[0],xmm12[0],xmm18[1],xmm12[1],xmm18[2],xmm12[2],xmm18[3],xmm12[3]
14125 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12
14126 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm11 {%k5}
14127 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
14128 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
14129 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u]
14130 ; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm12, %xmm3
14131 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm19[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14132 ; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm0
14133 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14134 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
14135 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm0
14136 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k5}
14137 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm6 {%k2}
14138 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
14139 ; AVX512BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm0
14140 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm7
14141 ; AVX512BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
14142 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1}
14143 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u]
14144 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
14145 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u]
14146 ; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
14147 ; AVX512BW-FCP-NEXT: movl $4186112, %edi # imm = 0x3FE000
14148 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1
14149 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
14150 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
14151 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm2 {%k1}
14152 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0
14153 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1}
14154 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm0
14155 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
14156 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero
14157 ; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm5, %xmm0
14158 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14159 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4
14160 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47]
14161 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm4, %zmm5
14162 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
14163 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
14164 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
14165 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi
14166 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rsi)
14167 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdx)
14168 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rcx)
14169 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%r8)
14170 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9)
14171 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%rdi)
14172 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
14173 ; AVX512BW-FCP-NEXT: vzeroupper
14174 ; AVX512BW-FCP-NEXT: retq
14176 ; AVX512DQ-BW-LABEL: load_i8_stride7_vf64:
14177 ; AVX512DQ-BW: # %bb.0:
14178 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm25
14179 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0]
14180 ; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm0, %zmm18
14181 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15]
14182 ; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm0, %zmm24
14183 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31]
14184 ; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm0, %zmm10
14185 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31]
14186 ; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm0, %zmm0
14187 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm9
14188 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm1
14189 ; AVX512DQ-BW-NEXT: movw $-28382, %ax # imm = 0x9122
14190 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
14191 ; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm3 {%k1}
14192 ; AVX512DQ-BW-NEXT: kmovq %k1, %k2
14193 ; AVX512DQ-BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
14194 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm4
14195 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
14196 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u]
14197 ; AVX512DQ-BW-NEXT: vporq %xmm4, %xmm3, %xmm16
14198 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
14199 ; AVX512DQ-BW-NEXT: movw $992, %ax # imm = 0x3E0
14200 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
14201 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm0, %ymm16 {%k1}
14202 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm11
14203 ; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm6
14204 ; AVX512DQ-BW-NEXT: movw $8772, %ax # imm = 0x2244
14205 ; AVX512DQ-BW-NEXT: kmovd %eax, %k6
14206 ; AVX512DQ-BW-NEXT: vpblendmw %ymm11, %ymm6, %ymm0 {%k6}
14207 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm3
14208 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u]
14209 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u]
14210 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm0, %xmm0
14211 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14212 ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm7
14213 ; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
14214 ; AVX512DQ-BW-NEXT: vpshufb %xmm21, %xmm7, %xmm3
14215 ; AVX512DQ-BW-NEXT: vmovdqa 208(%rdi), %xmm8
14216 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
14217 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
14218 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
14219 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
14220 ; AVX512DQ-BW-NEXT: vmovdqa64 240(%rdi), %xmm26
14221 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u]
14222 ; AVX512DQ-BW-NEXT: vmovdqa 224(%rdi), %xmm4
14223 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
14224 ; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm12, %xmm5
14225 ; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0
14226 ; AVX512DQ-BW-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000
14227 ; AVX512DQ-BW-NEXT: kmovq %rax, %k5
14228 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k5}
14229 ; AVX512DQ-BW-NEXT: vmovdqa 288(%rdi), %ymm13
14230 ; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm12
14231 ; AVX512DQ-BW-NEXT: movw $9288, %ax # imm = 0x2448
14232 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3
14233 ; AVX512DQ-BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k3}
14234 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u]
14235 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm0
14236 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u]
14237 ; AVX512DQ-BW-NEXT: vporq %xmm5, %xmm0, %xmm19
14238 ; AVX512DQ-BW-NEXT: vmovdqa64 352(%rdi), %ymm17
14239 ; AVX512DQ-BW-NEXT: vmovdqa 320(%rdi), %ymm0
14240 ; AVX512DQ-BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm5 {%k6}
14241 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm5[2,3,0,1]
14242 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4,5],ymm15[6],ymm5[7,8,9],ymm15[10],ymm5[11,12,13],ymm15[14],ymm5[15]
14243 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
14244 ; AVX512DQ-BW-NEXT: movw $3968, %ax # imm = 0xF80
14245 ; AVX512DQ-BW-NEXT: kmovd %eax, %k7
14246 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm5, %ymm19 {%k7}
14247 ; AVX512DQ-BW-NEXT: vmovdqa 416(%rdi), %ymm15
14248 ; AVX512DQ-BW-NEXT: vmovdqa 384(%rdi), %ymm5
14249 ; AVX512DQ-BW-NEXT: movw $4644, %ax # imm = 0x1224
14250 ; AVX512DQ-BW-NEXT: kmovd %eax, %k4
14251 ; AVX512DQ-BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm20 {%k4}
14252 ; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm20, %xmm22
14253 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
14254 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
14255 ; AVX512DQ-BW-NEXT: vporq %xmm22, %xmm20, %xmm20
14256 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20
14257 ; AVX512DQ-BW-NEXT: movl $-8388608, %eax # imm = 0xFF800000
14258 ; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm22 {%k4}
14259 ; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm22, %xmm23
14260 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
14261 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u,u,u,u,u]
14262 ; AVX512DQ-BW-NEXT: vporq %xmm23, %xmm22, %xmm22
14263 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
14264 ; AVX512DQ-BW-NEXT: movl $511, %edi # imm = 0x1FF
14265 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1
14266 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm22, %ymm10 {%k1}
14267 ; AVX512DQ-BW-NEXT: vpblendmw %ymm11, %ymm6, %ymm22 {%k3}
14268 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm23 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u]
14269 ; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm22, %xmm22
14270 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14,u,u,u,u]
14271 ; AVX512DQ-BW-NEXT: vporq %xmm23, %xmm22, %xmm22
14272 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm14
14273 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
14274 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm23 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
14275 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3]
14276 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm2
14277 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7]
14278 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u]
14279 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
14280 ; AVX512DQ-BW-NEXT: vporq %xmm14, %xmm22, %xmm14
14281 ; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm14, %zmm2, %zmm2
14282 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm10 {%k5}
14283 ; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm2 {%k6}
14284 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u]
14285 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2
14286 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u]
14287 ; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm14, %xmm2
14288 ; AVX512DQ-BW-NEXT: movl $261632, %edi # imm = 0x3FE00
14289 ; AVX512DQ-BW-NEXT: kmovd %edi, %k5
14290 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14291 ; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm14 {%k2}
14292 ; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm14, %xmm22
14293 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[1,8,15,u,u,u,u]
14294 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
14295 ; AVX512DQ-BW-NEXT: vporq %xmm22, %xmm14, %xmm14
14296 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
14297 ; AVX512DQ-BW-NEXT: vpshufb %xmm21, %xmm8, %xmm21
14298 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
14299 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3]
14300 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm3
14301 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7]
14302 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
14303 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
14304 ; AVX512DQ-BW-NEXT: vporq %xmm14, %xmm21, %xmm14
14305 ; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm14, %zmm3, %zmm23
14306 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm23 {%k1}
14307 ; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm2 {%k3}
14308 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
14309 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2
14310 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
14311 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
14312 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14313 ; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k4}
14314 ; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm3, %xmm18
14315 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u]
14316 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u]
14317 ; AVX512DQ-BW-NEXT: vporq %xmm18, %xmm3, %xmm3
14318 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
14319 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12]
14320 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
14321 ; AVX512DQ-BW-NEXT: vporq %xmm18, %xmm21, %xmm18
14322 ; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18
14323 ; AVX512DQ-BW-NEXT: movl $-134217728, %edi # imm = 0xF8000000
14324 ; AVX512DQ-BW-NEXT: kmovd %edi, %k2
14325 ; AVX512DQ-BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
14326 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm18, %ymm3 {%k2}
14327 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
14328 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
14329 ; AVX512DQ-BW-NEXT: vporq %xmm18, %xmm21, %xmm18
14330 ; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm18, %zmm3, %zmm18
14331 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm18 {%k1}
14332 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2
14333 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm20, %ymm19 {%k2}
14334 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2
14335 ; AVX512DQ-BW-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
14336 ; AVX512DQ-BW-NEXT: kmovq %rax, %k1
14337 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm16 {%k1}
14338 ; AVX512DQ-BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k4}
14339 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
14340 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u]
14341 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u]
14342 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
14343 ; AVX512DQ-BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k3}
14344 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
14345 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8,9,10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15]
14346 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
14347 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7}
14348 ; AVX512DQ-BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k6}
14349 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero
14350 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm3
14351 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10]
14352 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm14, %xmm3
14353 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
14354 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2}
14355 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
14356 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm10 {%k1}
14357 ; AVX512DQ-BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k6}
14358 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
14359 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u]
14360 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u]
14361 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
14362 ; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k4}
14363 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
14364 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5,6],ymm14[7,8],ymm3[9,10],ymm14[11],ymm3[12,13,14],ymm14[15]
14365 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
14366 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7}
14367 ; AVX512DQ-BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k3}
14368 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero
14369 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm3
14370 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11]
14371 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm14, %xmm3
14372 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
14373 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2}
14374 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
14375 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm23 {%k1}
14376 ; AVX512DQ-BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k3}
14377 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u]
14378 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2
14379 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u]
14380 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
14381 ; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k6}
14382 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
14383 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2,3],ymm14[4],ymm3[5,6],ymm14[7,8],ymm3[9,10,11],ymm14[12],ymm3[13,14],ymm14[15]
14384 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
14385 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7}
14386 ; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm3 {%k4}
14387 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm14
14388 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12]
14389 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero
14390 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm14, %xmm3
14391 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
14392 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2}
14393 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
14394 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm18 {%k1}
14395 ; AVX512DQ-BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
14396 ; AVX512DQ-BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k1}
14397 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
14398 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u]
14399 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u]
14400 ; AVX512DQ-BW-NEXT: vporq %xmm3, %xmm2, %xmm19
14401 ; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm2 {%k3}
14402 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
14403 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15]
14404 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
14405 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm2, %ymm19 {%k7}
14406 ; AVX512DQ-BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k4}
14407 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
14408 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u]
14409 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u]
14410 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
14411 ; AVX512DQ-BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k1}
14412 ; AVX512DQ-BW-NEXT: kmovq %k1, %k7
14413 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
14414 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7,8],ymm14[9],ymm3[10,11,12],ymm14[13],ymm3[14,15]
14415 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm20 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
14416 ; AVX512DQ-BW-NEXT: movl $8176, %eax # imm = 0x1FF0
14417 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
14418 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k1}
14419 ; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k6}
14420 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
14421 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13]
14422 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero
14423 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
14424 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
14425 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm19 {%k2}
14426 ; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k3}
14427 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero
14428 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2
14429 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14]
14430 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
14431 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
14432 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k2}
14433 ; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm22 {%k6}
14434 ; AVX512DQ-BW-NEXT: vpblendmw %ymm9, %ymm1, %ymm21 {%k6}
14435 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm17, %ymm0 {%k4}
14436 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
14437 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
14438 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm13, %ymm12 {%k6}
14439 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u]
14440 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm3
14441 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u]
14442 ; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm3, %xmm0
14443 ; AVX512DQ-BW-NEXT: movl $4186112, %eax # imm = 0x3FE000
14444 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
14445 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
14446 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm15, %ymm5 {%k7}
14447 ; AVX512DQ-BW-NEXT: vpblendmw %ymm9, %ymm1, %ymm2 {%k4}
14448 ; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k3}
14449 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm9, %ymm1 {%k3}
14450 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm11, %ymm6 {%k4}
14451 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm22[u,u,2,9],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[u,u,u,u,u]
14452 ; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm22, %xmm11
14453 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u]
14454 ; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm11, %xmm9
14455 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
14456 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13]
14457 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
14458 ; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm12, %xmm11
14459 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
14460 ; AVX512DQ-BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
14461 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k1}
14462 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u]
14463 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm3
14464 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u]
14465 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm11, %xmm3
14466 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
14467 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
14468 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14]
14469 ; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm12, %xmm11
14470 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
14471 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm11, %ymm3 {%k1}
14472 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm11
14473 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u]
14474 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
14475 ; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm11, %xmm6
14476 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
14477 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
14478 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15]
14479 ; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7
14480 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
14481 ; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1}
14482 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31]
14483 ; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm7, %zmm7
14484 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31]
14485 ; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm8, %zmm8
14486 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0]
14487 ; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm11, %zmm11
14488 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm12
14489 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u]
14490 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u]
14491 ; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm12, %xmm2
14492 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14493 ; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
14494 ; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm4, %xmm12
14495 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14496 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
14497 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm9
14498 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm2 {%k5}
14499 ; AVX512DQ-BW-NEXT: movw $-512, %ax # imm = 0xFE00
14500 ; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm21, %xmm9
14501 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u]
14502 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u]
14503 ; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm12, %xmm9
14504 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14505 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14506 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14507 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3]
14508 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3
14509 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm9 {%k5}
14510 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
14511 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm1
14512 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
14513 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm1, %xmm1
14514 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14515 ; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm26, %xmm3
14516 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14517 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
14518 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3
14519 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm1 {%k5}
14520 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
14521 ; AVX512DQ-BW-NEXT: vinserti32x8 $1, %ymm19, %zmm0, %zmm2 {%k1}
14522 ; AVX512DQ-BW-NEXT: vinserti32x8 $1, %ymm20, %zmm0, %zmm9 {%k1}
14523 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm3
14524 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15]
14525 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero
14526 ; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3
14527 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
14528 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15]
14529 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
14530 ; AVX512DQ-BW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
14531 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
14532 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi
14533 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rsi)
14534 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rdx)
14535 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, (%rcx)
14536 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, (%r8)
14537 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%r9)
14538 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rdi)
14539 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax)
14540 ; AVX512DQ-BW-NEXT: vzeroupper
14541 ; AVX512DQ-BW-NEXT: retq
14542 ;
14543 ; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf64:
14544 ; AVX512DQ-BW-FCP: # %bb.0:
14545 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26
14546 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
14547 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23]
14548 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm16
14549 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7]
14550 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm24
14551 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23]
14552 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm17
14553 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15]
14554 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm25
14555 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15]
14556 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm18
14557 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
14558 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm9
14559 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15]
14560 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm1, %zmm6
14561 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
14562 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm4
14563 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm12
14564 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
14565 ; AVX512DQ-BW-FCP-NEXT: movw $-28382, %ax # imm = 0x9122
14566 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
14567 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm1 {%k1}
14568 ; AVX512DQ-BW-FCP-NEXT: kmovq %k1, %k2
14569 ; AVX512DQ-BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
14570 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5
14571 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u]
14572 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
14573 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm1, %xmm1
14574 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
14575 ; AVX512DQ-BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0
14576 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
14577 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1}
14578 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm13
14579 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm11
14580 ; AVX512DQ-BW-FCP-NEXT: movw $8772, %ax # imm = 0x2244
14581 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
14582 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm13, %ymm11, %ymm4 {%k1}
14583 ; AVX512DQ-BW-FCP-NEXT: kmovq %k1, %k3
14584 ; AVX512DQ-BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
14585 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
14586 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u]
14587 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u]
14588 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
14589 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
14590 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,2,4,6]
14591 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm19
14592 ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm5, %ymm5
14593 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
14594 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
14595 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 240(%rdi), %xmm7
14596 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u,u,u,u,u]
14597 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %xmm8
14598 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u]
14599 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm10, %xmm5
14600 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm4, %zmm4
14601 ; AVX512DQ-BW-FCP-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000
14602 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k5
14603 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm1 {%k5}
14604 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm5
14605 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm4
14606 ; AVX512DQ-BW-FCP-NEXT: movw $9288, %ax # imm = 0x2448
14607 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6
14608 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm10 {%k6}
14609 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm10[u,u,u,u,u,3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u]
14610 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10
14611 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u]
14612 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm10, %xmm21
14613 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
14614 ; AVX512DQ-BW-FCP-NEXT: movw $3968, %ax # imm = 0xF80
14615 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k7
14616 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm21 {%k7}
14617 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm10
14618 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm6
14619 ; AVX512DQ-BW-FCP-NEXT: movw $4644, %ax # imm = 0x1224
14620 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4
14621 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm6, %ymm20 {%k4}
14622 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm22
14623 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
14624 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
14625 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm20, %xmm20
14626 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm22
14627 ; AVX512DQ-BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000
14628 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm20 {%k4}
14629 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm23
14630 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
14631 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[1,8,15],zero,zero,xmm20[4,11],zero,zero,xmm20[u,u,u,u,u,u,u]
14632 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm20, %xmm20
14633 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
14634 ; AVX512DQ-BW-FCP-NEXT: movl $511, %r10d # imm = 0x1FF
14635 ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1
14636 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm9 {%k1}
14637 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm13, %ymm11, %ymm20 {%k6}
14638 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[u,u,u,6,13],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u]
14639 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20
14640 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u],zero,zero,xmm20[4,11],zero,zero,xmm20[0,7,14,u,u,u,u]
14641 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm20, %xmm20
14642 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm15
14643 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,3,4,6]
14644 ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm20
14645 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm20[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
14646 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
14647 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u,u,u,u,u]
14648 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u]
14649 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm20, %xmm15
14650 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14
14651 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm9 {%k5}
14652 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm14 {%k3}
14653 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u]
14654 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
14655 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u]
14656 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm15
14657 ; AVX512DQ-BW-FCP-NEXT: movl $261632, %r10d # imm = 0x3FE00
14658 ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k5
14659 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14660 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm14 {%k2}
14661 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm20
14662 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,zero,xmm20[5,12],zero,zero,xmm20[1,8,15,u,u,u,u]
14663 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
14664 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm14, %xmm14
14665 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
14666 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,1,3,5,6]
14667 ; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm20, %ymm19
14668 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm19[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
14669 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7]
14670 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u]
14671 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm7[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
14672 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm14, %xmm19, %xmm14
14673 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm0, %zmm14
14674 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k1}
14675 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm12, %ymm0 {%k6}
14676 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u]
14677 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
14678 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u]
14679 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm15, %xmm0
14680 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14681 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm15 {%k4}
14682 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm19
14683 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u]
14684 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u]
14685 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm15, %xmm15
14686 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
14687 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm19
14688 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[5,12]
14689 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm20
14690 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm24 = xmm20[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
14691 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm24, %xmm23
14692 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm23
14693 ; AVX512DQ-BW-FCP-NEXT: movl $-134217728, %edi # imm = 0xF8000000
14694 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2
14695 ; AVX512DQ-BW-FCP-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
14696 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm23, %ymm15 {%k2}
14697 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u,u,u,u,u,u]
14698 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm24 = zero,zero,xmm7[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
14699 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm24, %xmm23
14700 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm23, %zmm15, %zmm15
14701 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm15 {%k1}
14702 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3
14703 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3}
14704 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm0
14705 ; AVX512DQ-BW-FCP-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
14706 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2
14707 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2}
14708 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k4}
14709 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm21
14710 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u],zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u]
14711 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u]
14712 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm0, %xmm0
14713 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
14714 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm18, %ymm0 {%k7}
14715 ; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
14716 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm6, %ymm18 {%k1}
14717 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero
14718 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18
14719 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10]
14720 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm18, %xmm18
14721 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18
14722 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm0 {%k3}
14723 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
14724 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm9 {%k2}
14725 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k1}
14726 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18
14727 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13,u,u]
14728 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u]
14729 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0
14730 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
14731 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm0 {%k7}
14732 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm6, %ymm17 {%k6}
14733 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero
14734 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17
14735 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11]
14736 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17
14737 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17
14738 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm0 {%k3}
14739 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
14740 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm14 {%k2}
14741 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm0 {%k6}
14742 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm0[u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u]
14743 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
14744 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u]
14745 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm0, %xmm0
14746 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
14747 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm0 {%k7}
14748 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm10, %ymm16 {%k4}
14749 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17
14750 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[5,12]
14751 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero
14752 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
14753 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
14754 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm0 {%k3}
14755 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
14756 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm15 {%k2}
14757 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31]
14758 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm17
14759 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31]
14760 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm0, %zmm0
14761 ; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
14762 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm16 {%k1}
14763 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm18
14764 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[1,8,15,u,u]
14765 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u]
14766 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm16, %xmm16
14767 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
14768 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm0, %ymm16 {%k7}
14769 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm0 {%k4}
14770 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18
14771 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u]
14772 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u]
14773 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0
14774 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
14775 ; AVX512DQ-BW-FCP-NEXT: movl $8176, %eax # imm = 0x1FF0
14776 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
14777 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm17 {%k1}
14778 ; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
14779 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm10, %ymm0 {%k1}
14780 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm0, %xmm18
14781 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13]
14782 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero
14783 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0
14784 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14785 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm16 {%k3}
14786 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm10, %ymm0 {%k6}
14787 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero
14788 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
14789 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14]
14790 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm0, %xmm0
14791 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14792 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm0, %ymm17 {%k3}
14793 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm12, %ymm3, %ymm21 {%k4}
14794 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm12, %ymm3, %ymm18 {%k1}
14795 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm0 {%k6}
14796 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm12, %ymm3 {%k6}
14797 ; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm13, %ymm12 {%k1}
14798 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm13, %ymm11 {%k4}
14799 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u]
14800 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
14801 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u]
14802 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
14803 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
14804 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13]
14805 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm20[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
14806 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm13, %xmm22, %xmm13
14807 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
14808 ; AVX512DQ-BW-FCP-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
14809 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm12 {%k2}
14810 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u]
14811 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
14812 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u]
14813 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm13, %xmm0
14814 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14815 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm20[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
14816 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[0,7,14]
14817 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm13, %xmm22, %xmm13
14818 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
14819 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm0 {%k2}
14820 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm13
14821 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u,u,u,u]
14822 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u]
14823 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm11, %xmm11
14824 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm13
14825 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
14826 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm19[1,8,15]
14827 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm19, %xmm11
14828 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
14829 ; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm13 {%k2}
14830 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
14831 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm11, %zmm19
14832 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
14833 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm11, %zmm20
14834 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
14835 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm11, %zmm11
14836 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm2
14837 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
14838 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[4,11],zero,zero,xmm21[0,7,14],zero,zero,xmm21[u,u,u,u,u,u,u]
14839 ; AVX512DQ-BW-FCP-NEXT: vporq %xmm2, %xmm21, %xmm2
14840 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14841 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
14842 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm8, %xmm11
14843 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14844 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm22[0],xmm11[1],xmm22[1],xmm11[2],xmm22[2],xmm11[3],xmm22[3]
14845 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11
14846 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm2 {%k5}
14847 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm11
14848 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u]
14849 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u]
14850 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11
14851 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14852 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14853 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14854 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm18[0],xmm12[0],xmm18[1],xmm12[1],xmm18[2],xmm12[2],xmm18[3],xmm12[3]
14855 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0
14856 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm11 {%k5}
14857 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
14858 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
14859 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u]
14860 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0
14861 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 {%k5} = ymm19[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14862 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm7, %xmm3
14863 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14864 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
14865 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm13, %zmm3
14866 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm0 {%k5}
14867 ; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
14868 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm6 {%k2}
14869 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
14870 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm26, %zmm3, %zmm3
14871 ; AVX512DQ-BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
14872 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1}
14873 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u]
14874 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
14875 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u]
14876 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
14877 ; AVX512DQ-BW-FCP-NEXT: movl $4186112, %edi # imm = 0x3FE000
14878 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
14879 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
14880 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
14881 ; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm16, %zmm0, %zmm2 {%k1}
14882 ; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm17, %zmm0, %zmm11 {%k1}
14883 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm3
14884 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15]
14885 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero
14886 ; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3
14887 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
14888 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4
14889 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47]
14890 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm5
14891 ; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm5, %zmm0, %zmm0 {%k1}
14892 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
14893 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi
14894 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rsi)
14895 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdx)
14896 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rcx)
14897 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%r8)
14898 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9)
14899 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%rdi)
14900 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
14901 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
14902 ; AVX512DQ-BW-FCP-NEXT: retq
14903 %wide.vec = load <448 x i8>, ptr %in.vec, align 64
14904 %strided.vec0 = shufflevector <448 x i8> %wide.vec, <448 x i8> poison, <64 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105, i32 112, i32 119, i32 126, i32 133, i32 140, i32 147, i32 154, i32 161, i32 168, i32 175, i32 182, i32 189, i32 196, i32 203, i32 210, i32 217, i32 224, i32 231, i32 238, i32 245, i32 252, i32 259, i32 266, i32 273, i32 280, i32 287, i32 294, i32 301, i32 308, i32 315, i32 322, i32 329, i32 336, i32 343, i32 350, i32 357, i32 364, i32 371, i32 378, i32 385, i32 392, i32 399, i32 406, i32 413, i32 420, i32 427, i32 434, i32 441>
14905 %strided.vec1 = shufflevector <448 x i8> %wide.vec, <448 x i8> poison, <64 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106, i32 113, i32 120, i32 127, i32 134, i32 141, i32 148, i32 155, i32 162, i32 169, i32 176, i32 183, i32 190, i32 197, i32 204, i32 211, i32 218, i32 225, i32 232, i32 239, i32 246, i32 253, i32 260, i32 267, i32 274, i32 281, i32 288, i32 295, i32 302, i32 309, i32 316, i32 323, i32 330, i32 337, i32 344, i32 351, i32 358, i32 365, i32 372, i32 379, i32 386, i32 393, i32 400, i32 407, i32 414, i32 421, i32 428, i32 435, i32 442>
14906 %strided.vec2 = shufflevector <448 x i8> %wide.vec, <448 x i8> poison, <64 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107, i32 114, i32 121, i32 128, i32 135, i32 142, i32 149, i32 156, i32 163, i32 170, i32 177, i32 184, i32 191, i32 198, i32 205, i32 212, i32 219, i32 226, i32 233, i32 240, i32 247, i32 254, i32 261, i32 268, i32 275, i32 282, i32 289, i32 296, i32 303, i32 310, i32 317, i32 324, i32 331, i32 338, i32 345, i32 352, i32 359, i32 366, i32 373, i32 380, i32 387, i32 394, i32 401, i32 408, i32 415, i32 422, i32 429, i32 436, i32 443>
14907 %strided.vec3 = shufflevector <448 x i8> %wide.vec, <448 x i8> poison, <64 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108, i32 115, i32 122, i32 129, i32 136, i32 143, i32 150, i32 157, i32 164, i32 171, i32 178, i32 185, i32 192, i32 199, i32 206, i32 213, i32 220, i32 227, i32 234, i32 241, i32 248, i32 255, i32 262, i32 269, i32 276, i32 283, i32 290, i32 297, i32 304, i32 311, i32 318, i32 325, i32 332, i32 339, i32 346, i32 353, i32 360, i32 367, i32 374, i32 381, i32 388, i32 395, i32 402, i32 409, i32 416, i32 423, i32 430, i32 437, i32 444>
14908 %strided.vec4 = shufflevector <448 x i8> %wide.vec, <448 x i8> poison, <64 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109, i32 116, i32 123, i32 130, i32 137, i32 144, i32 151, i32 158, i32 165, i32 172, i32 179, i32 186, i32 193, i32 200, i32 207, i32 214, i32 221, i32 228, i32 235, i32 242, i32 249, i32 256, i32 263, i32 270, i32 277, i32 284, i32 291, i32 298, i32 305, i32 312, i32 319, i32 326, i32 333, i32 340, i32 347, i32 354, i32 361, i32 368, i32 375, i32 382, i32 389, i32 396, i32 403, i32 410, i32 417, i32 424, i32 431, i32 438, i32 445>
14909 %strided.vec5 = shufflevector <448 x i8> %wide.vec, <448 x i8> poison, <64 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110, i32 117, i32 124, i32 131, i32 138, i32 145, i32 152, i32 159, i32 166, i32 173, i32 180, i32 187, i32 194, i32 201, i32 208, i32 215, i32 222, i32 229, i32 236, i32 243, i32 250, i32 257, i32 264, i32 271, i32 278, i32 285, i32 292, i32 299, i32 306, i32 313, i32 320, i32 327, i32 334, i32 341, i32 348, i32 355, i32 362, i32 369, i32 376, i32 383, i32 390, i32 397, i32 404, i32 411, i32 418, i32 425, i32 432, i32 439, i32 446>
14910 %strided.vec6 = shufflevector <448 x i8> %wide.vec, <448 x i8> poison, <64 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111, i32 118, i32 125, i32 132, i32 139, i32 146, i32 153, i32 160, i32 167, i32 174, i32 181, i32 188, i32 195, i32 202, i32 209, i32 216, i32 223, i32 230, i32 237, i32 244, i32 251, i32 258, i32 265, i32 272, i32 279, i32 286, i32 293, i32 300, i32 307, i32 314, i32 321, i32 328, i32 335, i32 342, i32 349, i32 356, i32 363, i32 370, i32 377, i32 384, i32 391, i32 398, i32 405, i32 412, i32 419, i32 426, i32 433, i32 440, i32 447>
14911 store <64 x i8> %strided.vec0, ptr %out.vec0, align 64
14912 store <64 x i8> %strided.vec1, ptr %out.vec1, align 64
14913 store <64 x i8> %strided.vec2, ptr %out.vec2, align 64
14914 store <64 x i8> %strided.vec3, ptr %out.vec3, align 64
14915 store <64 x i8> %strided.vec4, ptr %out.vec4, align 64
14916 store <64 x i8> %strided.vec5, ptr %out.vec5, align 64
14917 store <64 x i8> %strided.vec6, ptr %out.vec6, align 64