1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,FALLBACK0
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-ONLY,FALLBACK1
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-SLOW,FALLBACK2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST,FALLBACK3
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST-PERLANE,FALLBACK4
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-ONLY,AVX512-SLOW,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-ONLY,AVX512-FAST,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512DQ-ONLY,AVX512-SLOW,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512DQ-ONLY,AVX512-FAST,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-ONLY,AVX512-SLOW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-ONLY,AVX512-FAST,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512DQBW-ONLY,AVX512-SLOW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512DQBW-ONLY,AVX512-FAST,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12
16 ; These patterns are produced by LoopVectorizer for interleaved loads.
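;
; As an illustrative sketch only (the names and the C form are assumptions, not
; part of the checked output), the scalar source pattern behind these tests
; looks roughly like:
;
;   for (int i = 0; i < n; ++i) {   /* hypothetical C loop */
;     out0[i] = in[6*i + 0];        /* each output takes every 6th byte, */
;     out1[i] = in[6*i + 1];        /* starting at a different offset    */
;     out2[i] = in[6*i + 2];
;     out3[i] = in[6*i + 3];
;     out4[i] = in[6*i + 4];
;     out5[i] = in[6*i + 5];
;   }
;
; Vectorizing this by a factor VF yields one wide <6*VF x i8> load followed by
; six stride-6 shufflevectors, which is the IR exercised by the functions below.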
18 define void @load_i8_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
19 ; SSE-LABEL: load_i8_stride6_vf2:
; SSE: # %bb.0:
21 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
22 ; SSE-NEXT: movdqa (%rdi), %xmm1
23 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
24 ; SSE-NEXT: pand %xmm1, %xmm3
25 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,3,2,3,4,5,6,7]
26 ; SSE-NEXT: packuswb %xmm2, %xmm2
27 ; SSE-NEXT: pxor %xmm4, %xmm4
28 ; SSE-NEXT: movdqa %xmm1, %xmm0
29 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
30 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,3,2,3]
31 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
32 ; SSE-NEXT: packuswb %xmm5, %xmm5
33 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,2,2,3]
34 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,2,3,4,5,6,7]
35 ; SSE-NEXT: packuswb %xmm6, %xmm6
36 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
37 ; SSE-NEXT: movdqa %xmm0, %xmm4
38 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
39 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
40 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
41 ; SSE-NEXT: packuswb %xmm4, %xmm4
42 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
43 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
44 ; SSE-NEXT: packuswb %xmm3, %xmm3
45 ; SSE-NEXT: psrlq $48, %xmm1
46 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
47 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
48 ; SSE-NEXT: packuswb %xmm0, %xmm0
49 ; SSE-NEXT: movd %xmm2, %edi
50 ; SSE-NEXT: movw %di, (%rsi)
51 ; SSE-NEXT: movd %xmm5, %esi
52 ; SSE-NEXT: movw %si, (%rdx)
53 ; SSE-NEXT: movd %xmm6, %edx
54 ; SSE-NEXT: movw %dx, (%rcx)
55 ; SSE-NEXT: movd %xmm4, %ecx
56 ; SSE-NEXT: movw %cx, (%r8)
57 ; SSE-NEXT: movd %xmm3, %ecx
58 ; SSE-NEXT: movw %cx, (%r9)
59 ; SSE-NEXT: movd %xmm0, %ecx
60 ; SSE-NEXT: movw %cx, (%rax)
; SSE-NEXT: retq
;
63 ; AVX-LABEL: load_i8_stride6_vf2:
; AVX: # %bb.0:
65 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
66 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
67 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
68 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
69 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
70 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
71 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
72 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
73 ; AVX-NEXT: vpextrw $0, %xmm1, (%rsi)
74 ; AVX-NEXT: vpextrw $0, %xmm2, (%rdx)
75 ; AVX-NEXT: vpextrw $0, %xmm3, (%rcx)
76 ; AVX-NEXT: vpextrw $0, %xmm4, (%r8)
77 ; AVX-NEXT: vpextrw $0, %xmm5, (%r9)
78 ; AVX-NEXT: vpextrw $0, %xmm0, (%rax)
; AVX-NEXT: retq
80 %wide.vec = load <12 x i8>, ptr %in.vec, align 64
81 %strided.vec0 = shufflevector <12 x i8> %wide.vec, <12 x i8> poison, <2 x i32> <i32 0, i32 6>
82 %strided.vec1 = shufflevector <12 x i8> %wide.vec, <12 x i8> poison, <2 x i32> <i32 1, i32 7>
83 %strided.vec2 = shufflevector <12 x i8> %wide.vec, <12 x i8> poison, <2 x i32> <i32 2, i32 8>
84 %strided.vec3 = shufflevector <12 x i8> %wide.vec, <12 x i8> poison, <2 x i32> <i32 3, i32 9>
85 %strided.vec4 = shufflevector <12 x i8> %wide.vec, <12 x i8> poison, <2 x i32> <i32 4, i32 10>
86 %strided.vec5 = shufflevector <12 x i8> %wide.vec, <12 x i8> poison, <2 x i32> <i32 5, i32 11>
87 store <2 x i8> %strided.vec0, ptr %out.vec0, align 64
88 store <2 x i8> %strided.vec1, ptr %out.vec1, align 64
89 store <2 x i8> %strided.vec2, ptr %out.vec2, align 64
90 store <2 x i8> %strided.vec3, ptr %out.vec3, align 64
91 store <2 x i8> %strided.vec4, ptr %out.vec4, align 64
92 store <2 x i8> %strided.vec5, ptr %out.vec5, align 64
  ret void
}

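; vf4: a single <24 x i8> load deinterleaved into six <4 x i8> stride-6 slices.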
96 define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
97 ; SSE-LABEL: load_i8_stride6_vf4:
; SSE: # %bb.0:
99 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
100 ; SSE-NEXT: movdqa (%rdi), %xmm5
101 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
102 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535]
103 ; SSE-NEXT: movdqa %xmm5, %xmm2
104 ; SSE-NEXT: pand %xmm0, %xmm2
105 ; SSE-NEXT: pandn %xmm1, %xmm0
106 ; SSE-NEXT: por %xmm2, %xmm0
107 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
108 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [16711935,16711935,16711935,16711935]
109 ; SSE-NEXT: pand %xmm2, %xmm0
110 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
111 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
112 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
113 ; SSE-NEXT: packuswb %xmm0, %xmm0
114 ; SSE-NEXT: pxor %xmm3, %xmm3
115 ; SSE-NEXT: movdqa %xmm5, %xmm7
116 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
117 ; SSE-NEXT: pandn %xmm1, %xmm4
118 ; SSE-NEXT: movdqa %xmm1, %xmm6
119 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm5[0,0]
120 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[2,3]
121 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,65535,65535,0,65535,65535]
122 ; SSE-NEXT: pand %xmm8, %xmm1
123 ; SSE-NEXT: pandn %xmm5, %xmm8
124 ; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
125 ; SSE-NEXT: movdqa %xmm5, %xmm9
126 ; SSE-NEXT: psrld $16, %xmm9
127 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
128 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
129 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7]
130 ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3]
131 ; SSE-NEXT: packuswb %xmm7, %xmm7
132 ; SSE-NEXT: por %xmm7, %xmm4
133 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[2,1,2,3,4,5,6,7]
134 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
135 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7
136 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
137 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7]
138 ; SSE-NEXT: packuswb %xmm7, %xmm7
139 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
140 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,2],xmm5[0,3]
141 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0,1,3]
142 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
143 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7]
144 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
145 ; SSE-NEXT: packuswb %xmm6, %xmm6
146 ; SSE-NEXT: por %xmm1, %xmm8
147 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,1,2,0]
148 ; SSE-NEXT: pand %xmm2, %xmm1
149 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
150 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
151 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7]
152 ; SSE-NEXT: packuswb %xmm1, %xmm1
153 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3],xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
154 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm8[0,0]
155 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[2,3]
156 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,1,2,3,4,5,6,7]
157 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
158 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
159 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
160 ; SSE-NEXT: packuswb %xmm2, %xmm2
161 ; SSE-NEXT: movd %xmm0, (%rsi)
162 ; SSE-NEXT: movd %xmm4, (%rdx)
163 ; SSE-NEXT: movd %xmm7, (%rcx)
164 ; SSE-NEXT: movd %xmm6, (%r8)
165 ; SSE-NEXT: movd %xmm1, (%r9)
166 ; SSE-NEXT: movd %xmm2, (%rax)
; SSE-NEXT: retq
;
169 ; AVX-LABEL: load_i8_stride6_vf4:
; AVX: # %bb.0:
171 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
172 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
173 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
174 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm1[2,u,u,u,u,u,u,u,u,u,u,u,u]
175 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,6,12],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
176 ; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
177 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[3,u,u,u,u,u,u,u,u,u,u,u,u]
178 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,7,13],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
179 ; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
180 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[4,u,u,u,u,u,u,u,u,u,u,u,u]
181 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[2,8,14],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
182 ; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4
183 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[5,u,u,u,u,u,u,u,u,u,u,u,u]
184 ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[3,9,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
185 ; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5
186 ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
187 ; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
188 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
189 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
190 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
191 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
192 ; AVX-NEXT: vmovd %xmm2, (%rsi)
193 ; AVX-NEXT: vmovd %xmm3, (%rdx)
194 ; AVX-NEXT: vmovd %xmm4, (%rcx)
195 ; AVX-NEXT: vmovd %xmm5, (%r8)
196 ; AVX-NEXT: vmovd %xmm6, (%r9)
197 ; AVX-NEXT: vmovd %xmm0, (%rax)
; AVX-NEXT: retq
199 %wide.vec = load <24 x i8>, ptr %in.vec, align 64
200 %strided.vec0 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison, <4 x i32> <i32 0, i32 6, i32 12, i32 18>
201 %strided.vec1 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison, <4 x i32> <i32 1, i32 7, i32 13, i32 19>
202 %strided.vec2 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison, <4 x i32> <i32 2, i32 8, i32 14, i32 20>
203 %strided.vec3 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison, <4 x i32> <i32 3, i32 9, i32 15, i32 21>
204 %strided.vec4 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison, <4 x i32> <i32 4, i32 10, i32 16, i32 22>
205 %strided.vec5 = shufflevector <24 x i8> %wide.vec, <24 x i8> poison, <4 x i32> <i32 5, i32 11, i32 17, i32 23>
206 store <4 x i8> %strided.vec0, ptr %out.vec0, align 64
207 store <4 x i8> %strided.vec1, ptr %out.vec1, align 64
208 store <4 x i8> %strided.vec2, ptr %out.vec2, align 64
209 store <4 x i8> %strided.vec3, ptr %out.vec3, align 64
210 store <4 x i8> %strided.vec4, ptr %out.vec4, align 64
211 store <4 x i8> %strided.vec5, ptr %out.vec5, align 64
  ret void
}

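; vf8: a single <48 x i8> load deinterleaved into six <8 x i8> stride-6 slices.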
215 define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
216 ; SSE-LABEL: load_i8_stride6_vf8:
; SSE: # %bb.0:
218 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
219 ; SSE-NEXT: movdqa (%rdi), %xmm4
220 ; SSE-NEXT: movdqa 16(%rdi), %xmm3
221 ; SSE-NEXT: movdqa 32(%rdi), %xmm0
222 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,65535,65535,0,65535,65535,0]
223 ; SSE-NEXT: movdqa %xmm4, %xmm1
224 ; SSE-NEXT: pand %xmm8, %xmm1
225 ; SSE-NEXT: pandn %xmm3, %xmm8
226 ; SSE-NEXT: por %xmm1, %xmm8
227 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,1,3]
228 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [16711935,16711935,16711935,16711935]
229 ; SSE-NEXT: pand %xmm5, %xmm1
230 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
231 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
232 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
233 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,7,6,7]
234 ; SSE-NEXT: packuswb %xmm6, %xmm6
235 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
236 ; SSE-NEXT: pand %xmm1, %xmm6
237 ; SSE-NEXT: movdqa %xmm0, %xmm7
238 ; SSE-NEXT: pand %xmm5, %xmm7
239 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,2,1]
240 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5,6,5]
241 ; SSE-NEXT: packuswb %xmm9, %xmm9
242 ; SSE-NEXT: movdqa %xmm1, %xmm2
243 ; SSE-NEXT: pandn %xmm9, %xmm2
244 ; SSE-NEXT: por %xmm6, %xmm2
245 ; SSE-NEXT: pxor %xmm6, %xmm6
246 ; SSE-NEXT: movdqa %xmm8, %xmm9
247 ; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
248 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,1,0,3]
249 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7]
250 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7]
251 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,0,0,65535,65535]
252 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
253 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7]
254 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
255 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,0,4,5,6,7]
256 ; SSE-NEXT: pand %xmm10, %xmm8
257 ; SSE-NEXT: pandn %xmm9, %xmm10
258 ; SSE-NEXT: por %xmm8, %xmm10
259 ; SSE-NEXT: packuswb %xmm10, %xmm10
260 ; SSE-NEXT: pand %xmm1, %xmm10
261 ; SSE-NEXT: movdqa %xmm0, %xmm8
262 ; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
263 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
264 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[2,2,3,3]
265 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
266 ; SSE-NEXT: packuswb %xmm9, %xmm9
267 ; SSE-NEXT: pandn %xmm9, %xmm1
268 ; SSE-NEXT: por %xmm10, %xmm1
269 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,0,65535,65535,0,65535,65535]
270 ; SSE-NEXT: movdqa %xmm11, %xmm9
271 ; SSE-NEXT: pandn %xmm3, %xmm9
272 ; SSE-NEXT: movdqa %xmm4, %xmm12
273 ; SSE-NEXT: pand %xmm11, %xmm12
274 ; SSE-NEXT: por %xmm9, %xmm12
275 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm12[2,1,2,3,4,5,6,7]
276 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7]
277 ; SSE-NEXT: pand %xmm5, %xmm9
278 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3]
279 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,2,3,0,4,5,6,7]
280 ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,5,5,5,5]
281 ; SSE-NEXT: packuswb %xmm13, %xmm13
282 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,255,255,255,255,255,255,255,255]
283 ; SSE-NEXT: pand %xmm9, %xmm13
284 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm7[0,3,2,3,4,5,6,7]
285 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3]
286 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm10[0,1,2,3,4,4,5,6]
287 ; SSE-NEXT: packuswb %xmm14, %xmm14
288 ; SSE-NEXT: movdqa %xmm9, %xmm10
289 ; SSE-NEXT: pandn %xmm14, %xmm10
290 ; SSE-NEXT: por %xmm13, %xmm10
291 ; SSE-NEXT: movdqa %xmm12, %xmm13
292 ; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7]
293 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
294 ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,1,4,5,6,7]
295 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [0,65535,65535,0,65535,65535,65535,65535]
296 ; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15]
297 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
298 ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7]
299 ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7]
300 ; SSE-NEXT: pand %xmm14, %xmm12
301 ; SSE-NEXT: pandn %xmm13, %xmm14
302 ; SSE-NEXT: por %xmm12, %xmm14
303 ; SSE-NEXT: packuswb %xmm14, %xmm14
304 ; SSE-NEXT: pand %xmm9, %xmm14
305 ; SSE-NEXT: movdqa %xmm8, %xmm12
306 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[3,0]
307 ; SSE-NEXT: movaps %xmm0, %xmm13
308 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm12[0,2]
309 ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm13[0,1,2,3,7,5,6,7]
310 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,1,0,2]
311 ; SSE-NEXT: packuswb %xmm13, %xmm13
312 ; SSE-NEXT: movdqa %xmm9, %xmm12
313 ; SSE-NEXT: pandn %xmm13, %xmm12
314 ; SSE-NEXT: por %xmm14, %xmm12
315 ; SSE-NEXT: pand %xmm11, %xmm3
316 ; SSE-NEXT: pandn %xmm4, %xmm11
317 ; SSE-NEXT: por %xmm3, %xmm11
318 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[3,1,2,0]
319 ; SSE-NEXT: pand %xmm5, %xmm3
320 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
321 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3]
322 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,1,0,3,4,5,6,7]
323 ; SSE-NEXT: packuswb %xmm4, %xmm4
324 ; SSE-NEXT: pand %xmm9, %xmm4
325 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,7,6,7]
326 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,1,0,2]
327 ; SSE-NEXT: packuswb %xmm5, %xmm5
328 ; SSE-NEXT: movdqa %xmm9, %xmm3
329 ; SSE-NEXT: pandn %xmm5, %xmm3
330 ; SSE-NEXT: por %xmm4, %xmm3
331 ; SSE-NEXT: movdqa %xmm11, %xmm4
332 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
333 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
334 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
335 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535]
336 ; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
337 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm11[0,1,2,3,7,5,6,7]
338 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
339 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,1,2,4,5,6,7]
340 ; SSE-NEXT: pand %xmm5, %xmm6
341 ; SSE-NEXT: pandn %xmm4, %xmm5
342 ; SSE-NEXT: por %xmm6, %xmm5
343 ; SSE-NEXT: packuswb %xmm5, %xmm5
344 ; SSE-NEXT: pand %xmm9, %xmm5
345 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[0,0]
346 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[2,3]
347 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
348 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
349 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,7]
350 ; SSE-NEXT: packuswb %xmm0, %xmm0
351 ; SSE-NEXT: pandn %xmm0, %xmm9
352 ; SSE-NEXT: por %xmm5, %xmm9
353 ; SSE-NEXT: movq %xmm2, (%rsi)
354 ; SSE-NEXT: movq %xmm1, (%rdx)
355 ; SSE-NEXT: movq %xmm10, (%rcx)
356 ; SSE-NEXT: movq %xmm12, (%r8)
357 ; SSE-NEXT: movq %xmm3, (%r9)
358 ; SSE-NEXT: movq %xmm9, (%rax)
; SSE-NEXT: retq
;
361 ; AVX1-ONLY-LABEL: load_i8_stride6_vf8:
362 ; AVX1-ONLY: # %bb.0:
363 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
364 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
365 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
366 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm0
367 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm2[2,8,14,u,u,u,u,u,u,u,u,u,u]
368 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u]
369 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
370 ; AVX1-ONLY-NEXT: vpxor %xmm4, %xmm4, %xmm4
371 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7]
372 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[4,10,u,u,u,u,u,u,u,u]
373 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm3, %xmm3
374 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u]
375 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u]
376 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5
377 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4,5,6,7]
378 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,xmm0[5,11,u,u,u,u,u,u,u,u]
379 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4
380 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u]
381 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
382 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5
383 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,1,2,3,4,128,128,128,0,1,2,3,4,128,128,128]
384 ; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
385 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5
386 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[0,6,12,u,u,u,u,u,u,u,u]
387 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5
388 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u]
389 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
390 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7
391 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm7, %xmm7
392 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm0[1,7,13,u,u,u,u,u,u,u,u]
393 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7
394 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
395 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u]
396 ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm9, %xmm8
397 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm8
398 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
399 ; AVX1-ONLY-NEXT: vpor %xmm9, %xmm8, %xmm8
400 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
401 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u,u,u,u]
402 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
403 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1
404 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
405 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0
406 ; AVX1-ONLY-NEXT: vmovq %xmm3, (%rsi)
407 ; AVX1-ONLY-NEXT: vmovq %xmm4, (%rdx)
408 ; AVX1-ONLY-NEXT: vmovq %xmm5, (%rcx)
409 ; AVX1-ONLY-NEXT: vmovq %xmm7, (%r8)
410 ; AVX1-ONLY-NEXT: vmovq %xmm8, (%r9)
411 ; AVX1-ONLY-NEXT: vmovq %xmm0, (%rax)
412 ; AVX1-ONLY-NEXT: retq
;
414 ; AVX2-LABEL: load_i8_stride6_vf8:
; AVX2: # %bb.0:
416 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
417 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
418 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
419 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
420 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
421 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u,u,u]
422 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u]
423 ; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4
424 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u,u,u]
425 ; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u]
426 ; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
427 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
428 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
429 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
430 ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[2,8,14],zero,zero,xmm3[0,6,12,u,u,u,u,u,u,u,u]
431 ; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6
432 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u]
433 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[3,9,15],zero,zero,xmm3[1,7,13,u,u,u,u,u,u,u,u]
434 ; AVX2-NEXT: vpor %xmm5, %xmm3, %xmm3
435 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7],ymm0[8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14],ymm1[15]
436 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
437 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
438 ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14,u,u,u,u,u,u,u,u]
439 ; AVX2-NEXT: vpor %xmm5, %xmm7, %xmm5
440 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u]
441 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15,u,u,u,u,u,u,u,u]
442 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
443 ; AVX2-NEXT: vmovq %xmm4, (%rsi)
444 ; AVX2-NEXT: vmovq %xmm2, (%rdx)
445 ; AVX2-NEXT: vmovq %xmm6, (%rcx)
446 ; AVX2-NEXT: vmovq %xmm3, (%r8)
447 ; AVX2-NEXT: vmovq %xmm5, (%r9)
448 ; AVX2-NEXT: vmovq %xmm0, (%rax)
449 ; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
451 %wide.vec = load <48 x i8>, ptr %in.vec, align 64
452 %strided.vec0 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison, <8 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42>
453 %strided.vec1 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison, <8 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43>
454 %strided.vec2 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison, <8 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44>
455 %strided.vec3 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison, <8 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45>
456 %strided.vec4 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison, <8 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46>
457 %strided.vec5 = shufflevector <48 x i8> %wide.vec, <48 x i8> poison, <8 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47>
458 store <8 x i8> %strided.vec0, ptr %out.vec0, align 64
459 store <8 x i8> %strided.vec1, ptr %out.vec1, align 64
460 store <8 x i8> %strided.vec2, ptr %out.vec2, align 64
461 store <8 x i8> %strided.vec3, ptr %out.vec3, align 64
462 store <8 x i8> %strided.vec4, ptr %out.vec4, align 64
463 store <8 x i8> %strided.vec5, ptr %out.vec5, align 64
  ret void
}

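; vf16: the same stride-6 deinterleave with sixteen i8 elements per output vector.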
467 define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
468 ; SSE-LABEL: load_i8_stride6_vf16:
; SSE: # %bb.0:
470 ; SSE-NEXT: movdqa 64(%rdi), %xmm10
471 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
472 ; SSE-NEXT: movdqa (%rdi), %xmm5
473 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
474 ; SSE-NEXT: movdqa 32(%rdi), %xmm7
475 ; SSE-NEXT: movdqa 48(%rdi), %xmm6
476 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535]
477 ; SSE-NEXT: movdqa %xmm4, %xmm0
478 ; SSE-NEXT: pandn %xmm7, %xmm0
479 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,0]
480 ; SSE-NEXT: movdqa %xmm2, %xmm3
481 ; SSE-NEXT: pandn %xmm6, %xmm3
482 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
483 ; SSE-NEXT: movdqa %xmm4, %xmm3
484 ; SSE-NEXT: pandn %xmm6, %xmm3
485 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
486 ; SSE-NEXT: pand %xmm4, %xmm6
487 ; SSE-NEXT: por %xmm0, %xmm6
488 ; SSE-NEXT: movdqa %xmm6, %xmm0
489 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
490 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7]
491 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
492 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
493 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
494 ; SSE-NEXT: packuswb %xmm3, %xmm0
495 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535]
496 ; SSE-NEXT: movdqa %xmm8, %xmm9
497 ; SSE-NEXT: pandn %xmm0, %xmm9
498 ; SSE-NEXT: movdqa %xmm2, %xmm0
499 ; SSE-NEXT: movdqa %xmm2, %xmm11
500 ; SSE-NEXT: pandn %xmm1, %xmm11
501 ; SSE-NEXT: pand %xmm4, %xmm10
502 ; SSE-NEXT: movdqa %xmm4, %xmm2
503 ; SSE-NEXT: pandn %xmm1, %xmm2
504 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
505 ; SSE-NEXT: movdqa %xmm1, %xmm2
506 ; SSE-NEXT: movdqa %xmm5, %xmm14
507 ; SSE-NEXT: pand %xmm4, %xmm14
508 ; SSE-NEXT: movdqa 80(%rdi), %xmm3
509 ; SSE-NEXT: movdqa %xmm3, %xmm13
510 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
511 ; SSE-NEXT: pand %xmm4, %xmm13
512 ; SSE-NEXT: movdqa %xmm7, %xmm15
513 ; SSE-NEXT: pand %xmm4, %xmm7
514 ; SSE-NEXT: pand %xmm4, %xmm2
515 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
516 ; SSE-NEXT: movdqa %xmm4, %xmm12
517 ; SSE-NEXT: movdqa %xmm4, %xmm2
518 ; SSE-NEXT: pandn %xmm5, %xmm4
519 ; SSE-NEXT: pand %xmm0, %xmm5
520 ; SSE-NEXT: por %xmm11, %xmm5
521 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,2,1,3]
522 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
523 ; SSE-NEXT: pand %xmm1, %xmm11
524 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,6,7]
525 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,1,3]
526 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,1,4,5,6,7]
527 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,7,6,7]
528 ; SSE-NEXT: packuswb %xmm0, %xmm0
529 ; SSE-NEXT: pand %xmm8, %xmm0
530 ; SSE-NEXT: por %xmm9, %xmm0
531 ; SSE-NEXT: pandn %xmm3, %xmm12
532 ; SSE-NEXT: por %xmm12, %xmm10
533 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[3,1,2,0]
534 ; SSE-NEXT: pand %xmm1, %xmm9
535 ; SSE-NEXT: movdqa %xmm1, %xmm3
536 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7]
537 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,0]
538 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5]
539 ; SSE-NEXT: packuswb %xmm9, %xmm9
540 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
541 ; SSE-NEXT: movdqa %xmm11, %xmm12
542 ; SSE-NEXT: pandn %xmm9, %xmm12
543 ; SSE-NEXT: pand %xmm11, %xmm0
544 ; SSE-NEXT: por %xmm0, %xmm12
545 ; SSE-NEXT: pxor %xmm9, %xmm9
546 ; SSE-NEXT: movdqa %xmm6, %xmm0
547 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
548 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
549 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3]
550 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
551 ; SSE-NEXT: psrld $16, %xmm0
552 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
553 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7]
554 ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
555 ; SSE-NEXT: packuswb %xmm6, %xmm1
556 ; SSE-NEXT: movdqa %xmm5, %xmm0
557 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
558 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
559 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
560 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
561 ; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
562 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
563 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
564 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7]
565 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,65535,0,0,65535,65535]
566 ; SSE-NEXT: pand %xmm6, %xmm5
567 ; SSE-NEXT: pandn %xmm0, %xmm6
568 ; SSE-NEXT: por %xmm5, %xmm6
569 ; SSE-NEXT: packuswb %xmm6, %xmm6
570 ; SSE-NEXT: pand %xmm8, %xmm6
571 ; SSE-NEXT: pandn %xmm1, %xmm8
572 ; SSE-NEXT: por %xmm8, %xmm6
573 ; SSE-NEXT: movdqa %xmm10, %xmm0
574 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
575 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
576 ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
577 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[3,1,2,3,4,5,6,7]
578 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
579 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4]
580 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,0,65535]
581 ; SSE-NEXT: pand %xmm5, %xmm1
582 ; SSE-NEXT: pandn %xmm0, %xmm5
583 ; SSE-NEXT: por %xmm1, %xmm5
584 ; SSE-NEXT: packuswb %xmm5, %xmm0
585 ; SSE-NEXT: movdqa %xmm11, %xmm10
586 ; SSE-NEXT: pandn %xmm0, %xmm10
587 ; SSE-NEXT: pand %xmm11, %xmm6
588 ; SSE-NEXT: por %xmm6, %xmm10
589 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm15
590 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
591 ; SSE-NEXT: movdqa %xmm15, %xmm0
592 ; SSE-NEXT: pand %xmm3, %xmm0
593 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7]
594 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
595 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7]
596 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
597 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
598 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6]
599 ; SSE-NEXT: packuswb %xmm1, %xmm0
600 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
601 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[2,1,2,3,4,5,6,7]
602 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
603 ; SSE-NEXT: pand %xmm3, %xmm1
604 ; SSE-NEXT: movdqa %xmm3, %xmm8
605 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
606 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
607 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
608 ; SSE-NEXT: packuswb %xmm1, %xmm1
609 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
610 ; SSE-NEXT: movdqa %xmm3, %xmm5
611 ; SSE-NEXT: pandn %xmm1, %xmm5
612 ; SSE-NEXT: pand %xmm3, %xmm0
613 ; SSE-NEXT: por %xmm0, %xmm5
614 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
615 ; SSE-NEXT: pandn %xmm6, %xmm2
616 ; SSE-NEXT: por %xmm2, %xmm13
617 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,3,2,3,4,5,6,7]
618 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
619 ; SSE-NEXT: pand %xmm8, %xmm0
620 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
621 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
622 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
623 ; SSE-NEXT: packuswb %xmm0, %xmm0
624 ; SSE-NEXT: movdqa %xmm11, %xmm8
625 ; SSE-NEXT: pandn %xmm0, %xmm8
626 ; SSE-NEXT: pand %xmm11, %xmm5
627 ; SSE-NEXT: por %xmm5, %xmm8
628 ; SSE-NEXT: movdqa %xmm15, %xmm0
629 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
630 ; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15]
631 ; SSE-NEXT: movdqa %xmm15, %xmm1
632 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0]
633 ; SSE-NEXT: movaps %xmm0, %xmm2
634 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
635 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm15[0,0]
636 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[2,3]
637 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,5,6,7]
638 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
639 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
640 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
641 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
642 ; SSE-NEXT: packuswb %xmm0, %xmm1
643 ; SSE-NEXT: movdqa %xmm14, %xmm0
644 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
645 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
646 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
647 ; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15]
648 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,3,2,1]
649 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
650 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
651 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,0,65535,65535,65535,65535]
652 ; SSE-NEXT: pand %xmm5, %xmm2
653 ; SSE-NEXT: pandn %xmm0, %xmm5
654 ; SSE-NEXT: por %xmm2, %xmm5
655 ; SSE-NEXT: pand %xmm3, %xmm1
656 ; SSE-NEXT: packuswb %xmm5, %xmm5
657 ; SSE-NEXT: pandn %xmm5, %xmm3
658 ; SSE-NEXT: por %xmm1, %xmm3
659 ; SSE-NEXT: movdqa %xmm13, %xmm0
660 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
661 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
662 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5]
663 ; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7]
664 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,0,3]
665 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7]
666 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,0]
667 ; SSE-NEXT: pand %xmm2, %xmm1
668 ; SSE-NEXT: pandn %xmm0, %xmm2
669 ; SSE-NEXT: por %xmm1, %xmm2
670 ; SSE-NEXT: pand %xmm11, %xmm3
671 ; SSE-NEXT: packuswb %xmm2, %xmm0
672 ; SSE-NEXT: pandn %xmm0, %xmm11
673 ; SSE-NEXT: por %xmm3, %xmm11
674 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
675 ; SSE-NEXT: movdqa %xmm7, %xmm0
676 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
677 ; SSE-NEXT: pand %xmm2, %xmm0
678 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3]
679 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
680 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
681 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
682 ; SSE-NEXT: packuswb %xmm1, %xmm0
683 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255]
684 ; SSE-NEXT: movdqa %xmm3, %xmm1
685 ; SSE-NEXT: pandn %xmm0, %xmm1
686 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
687 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,0]
688 ; SSE-NEXT: pand %xmm2, %xmm0
689 ; SSE-NEXT: movdqa %xmm2, %xmm5
690 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
691 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
692 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,1,0,3,4,5,6,7]
693 ; SSE-NEXT: packuswb %xmm2, %xmm2
694 ; SSE-NEXT: pand %xmm3, %xmm2
695 ; SSE-NEXT: por %xmm1, %xmm2
696 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,0,65535,65535,0]
697 ; SSE-NEXT: movdqa %xmm6, %xmm1
698 ; SSE-NEXT: pand %xmm13, %xmm1
699 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
700 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
701 ; SSE-NEXT: pand %xmm0, %xmm2
702 ; SSE-NEXT: por %xmm1, %xmm13
703 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,1,3]
704 ; SSE-NEXT: pand %xmm5, %xmm1
705 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
706 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
707 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
708 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
709 ; SSE-NEXT: packuswb %xmm1, %xmm1
710 ; SSE-NEXT: movdqa %xmm0, %xmm6
711 ; SSE-NEXT: pandn %xmm1, %xmm6
712 ; SSE-NEXT: por %xmm2, %xmm6
713 ; SSE-NEXT: movdqa %xmm7, %xmm1
714 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
715 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
716 ; SSE-NEXT: movdqa %xmm7, %xmm2
717 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[0,0]
718 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm1[2,3]
719 ; SSE-NEXT: psrlq $48, %xmm1
720 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
721 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
722 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7]
723 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
724 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7]
725 ; SSE-NEXT: packuswb %xmm2, %xmm1
726 ; SSE-NEXT: movdqa %xmm4, %xmm2
727 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
728 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
729 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,5,5,5,5]
730 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,65535]
731 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
732 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7]
733 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
734 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,1,2,4,5,6,7]
735 ; SSE-NEXT: pand %xmm2, %xmm4
736 ; SSE-NEXT: pandn %xmm5, %xmm2
737 ; SSE-NEXT: por %xmm4, %xmm2
738 ; SSE-NEXT: packuswb %xmm2, %xmm2
739 ; SSE-NEXT: pand %xmm3, %xmm2
740 ; SSE-NEXT: pandn %xmm1, %xmm3
741 ; SSE-NEXT: por %xmm3, %xmm2
742 ; SSE-NEXT: movdqa %xmm13, %xmm1
743 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
744 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1]
745 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
746 ; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15]
747 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,0,0]
748 ; SSE-NEXT: pand %xmm3, %xmm1
749 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,7,5,6,7]
750 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0]
751 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,7,4]
752 ; SSE-NEXT: pandn %xmm4, %xmm3
753 ; SSE-NEXT: por %xmm1, %xmm3
754 ; SSE-NEXT: pand %xmm0, %xmm2
755 ; SSE-NEXT: packuswb %xmm3, %xmm1
756 ; SSE-NEXT: pandn %xmm1, %xmm0
757 ; SSE-NEXT: por %xmm2, %xmm0
758 ; SSE-NEXT: movdqa %xmm12, (%rsi)
759 ; SSE-NEXT: movdqa %xmm10, (%rdx)
760 ; SSE-NEXT: movdqa %xmm8, (%rcx)
761 ; SSE-NEXT: movdqa %xmm11, (%r8)
762 ; SSE-NEXT: movdqa %xmm6, (%r9)
763 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
764 ; SSE-NEXT: movdqa %xmm0, (%rax)
; SSE-NEXT: retq
;
767 ; AVX1-ONLY-LABEL: load_i8_stride6_vf16:
768 ; AVX1-ONLY: # %bb.0:
769 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
770 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
771 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
772 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm0
773 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3
774 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,4,10,u,u,u,u,u,u,u,u,u,u,u,u]
775 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,0,6,12,u,u,u,u,u,u,u,u,u]
776 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
777 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[2,8,14,u,u,u,u,u,u,u,u,u,u]
778 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,6,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u]
779 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5
780 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7]
781 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4
782 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[4,10]
783 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5
784 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
785 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7
786 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
787 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm6, %xmm7, %xmm6
788 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,5,11,u,u,u,u,u,u,u,u,u,u,u,u]
789 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,u,u,u,1,7,13,u,u,u,u,u,u,u,u,u]
790 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
791 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm2[3,9,15,u,u,u,u,u,u,u,u,u,u]
792 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[1,7,13],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u]
793 ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm10, %xmm8
794 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7]
795 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[5,11]
796 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
797 ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm10, %xmm8
798 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm7, %xmm8, %xmm7
799 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u]
800 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u]
801 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm10[0],xmm8[0]
802 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u]
803 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
804 ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10
805 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u]
806 ; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm8, %xmm10, %xmm8
807 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero
808 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[0,6,12]
809 ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm12, %xmm10
810 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm8, %xmm10, %xmm8
811 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u]
812 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u]
813 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm12[0],xmm10[0]
814 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm2[5,11,u,u,u,u,u,u,u,u,u,u,u]
815 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
816 ; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12
817 ; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm10, %xmm12, %xmm10
818 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
819 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[1,7,13]
820 ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11
821 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm10, %xmm11, %xmm9
822 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[4,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
823 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm2[0,6,12,u,u,u,u,u,u,u,u,u,u,u]
824 ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10
825 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u]
826 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14]
827 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1]
828 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u]
829 ; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm10, %xmm11, %xmm10
830 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[2,8,14]
831 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero
832 ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm13, %xmm11
833 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm11[5,6,7]
834 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
835 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,7,13,u,u,u,u,u,u,u,u,u,u,u]
836 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
837 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u]
838 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15]
839 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1]
840 ; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm1, %xmm0, %xmm0
841 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[3,9,15]
842 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero
843 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
844 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
845 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rsi)
846 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, (%rdx)
847 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rcx)
848 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, (%r8)
849 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, (%r9)
850 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rax)
851 ; AVX1-ONLY-NEXT: retq
;
853 ; AVX2-ONLY-LABEL: load_i8_stride6_vf16:
854 ; AVX2-ONLY: # %bb.0:
855 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
856 ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm3
857 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm4
858 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255]
859 ; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm5
860 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u]
861 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm6
862 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u]
863 ; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm2
864 ; AVX2-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0
865 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[4,10]
866 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1
867 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
868 ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7
869 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
870 ; AVX2-ONLY-NEXT: vpblendvb %xmm8, %xmm2, %xmm7, %xmm2
871 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u]
872 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u]
873 ; AVX2-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5
874 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,11]
875 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
876 ; AVX2-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6
877 ; AVX2-ONLY-NEXT: vpblendvb %xmm8, %xmm5, %xmm6, %xmm5
878 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255]
879 ; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm6
880 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm6, %xmm7
881 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[2,8,14,u,u,u,u,u]
882 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[2,8,14],zero,zero,xmm6[0,6,12],zero,zero,zero,xmm6[u,u,u,u,u]
883 ; AVX2-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9
884 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero
885 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,6,12]
886 ; AVX2-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10
887 ; AVX2-ONLY-NEXT: vpblendvb %xmm8, %xmm9, %xmm10, %xmm9
888 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[3,9,15,u,u,u,u,u]
889 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[3,9,15],zero,zero,xmm6[1,7,13],zero,zero,zero,xmm6[u,u,u,u,u]
890 ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6
891 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
892 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13]
893 ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7
894 ; AVX2-ONLY-NEXT: vpblendvb %xmm8, %xmm6, %xmm7, %xmm6
895 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u]
896 ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3
897 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm3, %xmm4
898 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u]
899 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[4,10],zero,zero,zero,xmm3[2,8,14],zero,zero,xmm3[u,u,u,u,u,u]
900 ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7
901 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,8,14]
902 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero
903 ; AVX2-ONLY-NEXT: vpor %xmm8, %xmm10, %xmm8
904 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7]
905 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u]
906 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,11],zero,zero,zero,xmm3[3,9,15],zero,zero,xmm3[u,u,u,u,u,u]
907 ; AVX2-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3
908 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,9,15]
909 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero
910 ; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0
911 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
912 ; AVX2-ONLY-NEXT: vmovdqa %xmm2, (%rsi)
913 ; AVX2-ONLY-NEXT: vmovdqa %xmm5, (%rdx)
914 ; AVX2-ONLY-NEXT: vmovdqa %xmm9, (%rcx)
915 ; AVX2-ONLY-NEXT: vmovdqa %xmm6, (%r8)
916 ; AVX2-ONLY-NEXT: vmovdqa %xmm7, (%r9)
917 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rax)
918 ; AVX2-ONLY-NEXT: vzeroupper
919 ; AVX2-ONLY-NEXT: retq
;
921 ; AVX512F-LABEL: load_i8_stride6_vf16:
; AVX512F: # %bb.0:
923 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
924 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
925 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm3
926 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm4
927 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm5
928 ; AVX512F-NEXT: vpternlogq $202, %ymm4, %ymm3, %ymm5
929 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u]
930 ; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
931 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[2,8,14],zero,zero,xmm6[0,6,12,u,u,u,u,u]
932 ; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm7
933 ; AVX512F-NEXT: vmovdqa 80(%rdi), %xmm2
934 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[4,10]
935 ; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm1
936 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
937 ; AVX512F-NEXT: vpor %xmm8, %xmm9, %xmm8
938 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
939 ; AVX512F-NEXT: vpternlogq $184, %xmm7, %xmm9, %xmm8
940 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero,xmm5[u,u,u,u,u]
941 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[3,9,15],zero,zero,xmm6[1,7,13,u,u,u,u,u]
942 ; AVX512F-NEXT: vpor %xmm5, %xmm6, %xmm5
943 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[5,11]
944 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
945 ; AVX512F-NEXT: vpor %xmm6, %xmm7, %xmm6
946 ; AVX512F-NEXT: vpternlogq $184, %xmm5, %xmm9, %xmm6
947 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero
948 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,6,12]
949 ; AVX512F-NEXT: vpor %xmm5, %xmm7, %xmm5
950 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
951 ; AVX512F-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm7
952 ; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm10
953 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u]
954 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[2,8,14],zero,zero,xmm7[0,6,12],zero,zero,zero,xmm7[u,u,u,u,u]
955 ; AVX512F-NEXT: vpor %xmm11, %xmm12, %xmm11
956 ; AVX512F-NEXT: vpternlogq $226, %xmm5, %xmm9, %xmm11
957 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
958 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,7,13]
959 ; AVX512F-NEXT: vpor %xmm5, %xmm12, %xmm5
960 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u]
961 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,9,15],zero,zero,xmm7[1,7,13],zero,zero,zero,xmm7[u,u,u,u,u]
962 ; AVX512F-NEXT: vpor %xmm7, %xmm10, %xmm7
963 ; AVX512F-NEXT: vpternlogq $226, %xmm5, %xmm9, %xmm7
964 ; AVX512F-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm0
965 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3
966 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u]
967 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[2,8,14]
968 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
969 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u]
970 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero
971 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm9[5,6,7]
972 ; AVX512F-NEXT: vpor %xmm4, %xmm5, %xmm4
973 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u]
974 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[3,9,15]
975 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7]
976 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u]
977 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero
978 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
979 ; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0
980 ; AVX512F-NEXT: vmovdqa %xmm8, (%rsi)
981 ; AVX512F-NEXT: vmovdqa %xmm6, (%rdx)
982 ; AVX512F-NEXT: vmovdqa %xmm11, (%rcx)
983 ; AVX512F-NEXT: vmovdqa %xmm7, (%r8)
984 ; AVX512F-NEXT: vmovdqa %xmm4, (%r9)
985 ; AVX512F-NEXT: vmovdqa %xmm0, (%rax)
986 ; AVX512F-NEXT: vzeroupper
987 ; AVX512F-NEXT: retq
989 ; AVX512BW-LABEL: load_i8_stride6_vf16:
990 ; AVX512BW: # %bb.0:
991 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
992 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
993 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
994 ; AVX512BW-NEXT: movw $18724, %r10w # imm = 0x4924
995 ; AVX512BW-NEXT: kmovd %r10d, %k1
996 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1}
997 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,6,12],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[u,u,u,u,u]
998 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm4
999 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm4[2,8,14],zero,zero,xmm4[0,6,12,u,u,u,u,u]
1000 ; AVX512BW-NEXT: vpor %xmm3, %xmm5, %xmm3
1001 ; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm5
1002 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[4,10]
1003 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm7
1004 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
1005 ; AVX512BW-NEXT: vpor %xmm6, %xmm8, %xmm6
1006 ; AVX512BW-NEXT: movw $-2048, %di # imm = 0xF800
1007 ; AVX512BW-NEXT: kmovd %edi, %k2
1008 ; AVX512BW-NEXT: vmovdqu8 %xmm6, %xmm3 {%k2}
1009 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,7,13],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[u,u,u,u,u]
1010 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[3,9,15],zero,zero,xmm4[1,7,13,u,u,u,u,u]
1011 ; AVX512BW-NEXT: vpor %xmm2, %xmm4, %xmm2
1012 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,11]
1013 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
1014 ; AVX512BW-NEXT: vpor %xmm4, %xmm6, %xmm4
1015 ; AVX512BW-NEXT: vmovdqu8 %xmm4, %xmm2 {%k2}
1016 ; AVX512BW-NEXT: movw $9362, %di # imm = 0x2492
1017 ; AVX512BW-NEXT: kmovd %edi, %k3
1018 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm4 {%k3}
1019 ; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm6
1020 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14,u,u,u,u,u]
1021 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[u,u,u,u,u]
1022 ; AVX512BW-NEXT: vpor %xmm8, %xmm9, %xmm8
1023 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero
1024 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,6,12]
1025 ; AVX512BW-NEXT: vpor %xmm9, %xmm10, %xmm9
1026 ; AVX512BW-NEXT: vmovdqu8 %xmm9, %xmm8 {%k2}
1027 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15,u,u,u,u,u]
1028 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[u,u,u,u,u]
1029 ; AVX512BW-NEXT: vpor %xmm6, %xmm4, %xmm4
1030 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u,u,5,11],zero,zero,zero
1031 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,7,13]
1032 ; AVX512BW-NEXT: vpor %xmm6, %xmm9, %xmm6
1033 ; AVX512BW-NEXT: vmovdqu8 %xmm6, %xmm4 {%k2}
1034 ; AVX512BW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
1035 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm0
1036 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u]
1037 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[2,8,14]
1038 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm9[5,6,7]
1039 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[4,10],zero,zero,zero,xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u]
1040 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero
1041 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7]
1042 ; AVX512BW-NEXT: vpor %xmm6, %xmm9, %xmm6
1043 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,7,13],zero,zero,zero,xmm0[5,11,u,u,u,u,u,u]
1044 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[3,9,15]
1045 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7]
1046 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,11],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[u,u,u,u,u,u]
1047 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,1,7,13],zero,zero,zero
1048 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
1049 ; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0
1050 ; AVX512BW-NEXT: vmovdqa %xmm3, (%rsi)
1051 ; AVX512BW-NEXT: vmovdqa %xmm2, (%rdx)
1052 ; AVX512BW-NEXT: vmovdqa %xmm8, (%rcx)
1053 ; AVX512BW-NEXT: vmovdqa %xmm4, (%r8)
1054 ; AVX512BW-NEXT: vmovdqa %xmm6, (%r9)
1055 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rax)
1056 ; AVX512BW-NEXT: vzeroupper
1057 ; AVX512BW-NEXT: retq
1058 %wide.vec = load <96 x i8>, ptr %in.vec, align 64
1059 %strided.vec0 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison, <16 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90>
1060 %strided.vec1 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison, <16 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43, i32 49, i32 55, i32 61, i32 67, i32 73, i32 79, i32 85, i32 91>
1061 %strided.vec2 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison, <16 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44, i32 50, i32 56, i32 62, i32 68, i32 74, i32 80, i32 86, i32 92>
1062 %strided.vec3 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison, <16 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45, i32 51, i32 57, i32 63, i32 69, i32 75, i32 81, i32 87, i32 93>
1063 %strided.vec4 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison, <16 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46, i32 52, i32 58, i32 64, i32 70, i32 76, i32 82, i32 88, i32 94>
1064 %strided.vec5 = shufflevector <96 x i8> %wide.vec, <96 x i8> poison, <16 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47, i32 53, i32 59, i32 65, i32 71, i32 77, i32 83, i32 89, i32 95>
1065 store <16 x i8> %strided.vec0, ptr %out.vec0, align 64
1066 store <16 x i8> %strided.vec1, ptr %out.vec1, align 64
1067 store <16 x i8> %strided.vec2, ptr %out.vec2, align 64
1068 store <16 x i8> %strided.vec3, ptr %out.vec3, align 64
1069 store <16 x i8> %strided.vec4, ptr %out.vec4, align 64
1070 store <16 x i8> %strided.vec5, ptr %out.vec5, align 64
1071 ret void
1072 }
1074 define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
1075 ; SSE-LABEL: load_i8_stride6_vf32:
1076 ; SSE: # %bb.0:
1077 ; SSE-NEXT: subq $264, %rsp # imm = 0x108
1078 ; SSE-NEXT: movdqa 64(%rdi), %xmm7
1079 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1080 ; SSE-NEXT: movdqa 80(%rdi), %xmm9
1081 ; SSE-NEXT: movdqa (%rdi), %xmm12
1082 ; SSE-NEXT: movdqa 16(%rdi), %xmm14
1083 ; SSE-NEXT: movdqa 32(%rdi), %xmm1
1084 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1085 ; SSE-NEXT: movdqa 48(%rdi), %xmm5
1086 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,0,65535,65535]
1087 ; SSE-NEXT: movdqa %xmm10, %xmm0
1088 ; SSE-NEXT: pandn %xmm1, %xmm0
1089 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,0]
1090 ; SSE-NEXT: movdqa %xmm11, %xmm1
1091 ; SSE-NEXT: pandn %xmm5, %xmm1
1092 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1093 ; SSE-NEXT: movdqa %xmm10, %xmm1
1094 ; SSE-NEXT: pandn %xmm5, %xmm1
1095 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1096 ; SSE-NEXT: movdqa %xmm5, %xmm15
1097 ; SSE-NEXT: pand %xmm10, %xmm15
1098 ; SSE-NEXT: por %xmm0, %xmm15
1099 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
1100 ; SSE-NEXT: movdqa %xmm15, %xmm0
1101 ; SSE-NEXT: pand %xmm1, %xmm0
1102 ; SSE-NEXT: movdqa %xmm1, %xmm3
1103 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
1104 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
1105 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
1106 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
1107 ; SSE-NEXT: packuswb %xmm1, %xmm0
1108 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535]
1109 ; SSE-NEXT: movdqa %xmm11, %xmm1
1110 ; SSE-NEXT: pandn %xmm14, %xmm1
1111 ; SSE-NEXT: movdqa %xmm12, %xmm8
1112 ; SSE-NEXT: pand %xmm11, %xmm8
1113 ; SSE-NEXT: por %xmm1, %xmm8
1114 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,1,3]
1115 ; SSE-NEXT: pand %xmm3, %xmm1
1116 ; SSE-NEXT: movdqa %xmm3, %xmm6
1117 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
1118 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
1119 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
1120 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
1121 ; SSE-NEXT: packuswb %xmm1, %xmm1
1122 ; SSE-NEXT: pand %xmm2, %xmm1
1123 ; SSE-NEXT: movdqa %xmm2, %xmm3
1124 ; SSE-NEXT: movdqa %xmm2, %xmm5
1125 ; SSE-NEXT: pandn %xmm0, %xmm3
1126 ; SSE-NEXT: por %xmm3, %xmm1
1127 ; SSE-NEXT: movdqa %xmm10, %xmm0
1128 ; SSE-NEXT: pandn %xmm9, %xmm0
1129 ; SSE-NEXT: pand %xmm10, %xmm7
1130 ; SSE-NEXT: por %xmm0, %xmm7
1131 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,1,2,0]
1132 ; SSE-NEXT: pand %xmm6, %xmm0
1133 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
1134 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0]
1135 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
1136 ; SSE-NEXT: packuswb %xmm0, %xmm0
1137 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1138 ; SSE-NEXT: movdqa %xmm3, %xmm2
1139 ; SSE-NEXT: pandn %xmm0, %xmm2
1140 ; SSE-NEXT: pand %xmm3, %xmm1
1141 ; SSE-NEXT: por %xmm1, %xmm2
1142 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1143 ; SSE-NEXT: movdqa 128(%rdi), %xmm1
1144 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1145 ; SSE-NEXT: movdqa %xmm10, %xmm0
1146 ; SSE-NEXT: pandn %xmm1, %xmm0
1147 ; SSE-NEXT: movdqa 144(%rdi), %xmm1
1148 ; SSE-NEXT: movdqa %xmm11, %xmm2
1149 ; SSE-NEXT: pandn %xmm1, %xmm2
1150 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1151 ; SSE-NEXT: movdqa %xmm10, %xmm2
1152 ; SSE-NEXT: pandn %xmm1, %xmm2
1153 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1154 ; SSE-NEXT: movdqa %xmm1, %xmm2
1155 ; SSE-NEXT: pand %xmm10, %xmm2
1156 ; SSE-NEXT: por %xmm0, %xmm2
1157 ; SSE-NEXT: movdqa %xmm2, %xmm0
1158 ; SSE-NEXT: pand %xmm6, %xmm0
1159 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7]
1160 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
1161 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
1162 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
1163 ; SSE-NEXT: packuswb %xmm3, %xmm0
1164 ; SSE-NEXT: movdqa %xmm5, %xmm6
1165 ; SSE-NEXT: pandn %xmm0, %xmm6
1166 ; SSE-NEXT: movdqa %xmm10, %xmm1
1167 ; SSE-NEXT: movdqa %xmm10, %xmm0
1168 ; SSE-NEXT: pandn %xmm12, %xmm0
1169 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1170 ; SSE-NEXT: movdqa 112(%rdi), %xmm0
1171 ; SSE-NEXT: movdqa %xmm11, %xmm3
1172 ; SSE-NEXT: pandn %xmm0, %xmm3
1173 ; SSE-NEXT: movdqa 160(%rdi), %xmm5
1174 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1175 ; SSE-NEXT: pand %xmm10, %xmm5
1176 ; SSE-NEXT: movdqa %xmm10, %xmm4
1177 ; SSE-NEXT: pandn %xmm14, %xmm4
1178 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1179 ; SSE-NEXT: pand %xmm10, %xmm12
1180 ; SSE-NEXT: movdqa %xmm11, %xmm4
1181 ; SSE-NEXT: pandn %xmm9, %xmm4
1182 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1183 ; SSE-NEXT: movdqa %xmm9, %xmm11
1184 ; SSE-NEXT: pand %xmm10, %xmm11
1185 ; SSE-NEXT: movdqa %xmm10, %xmm4
1186 ; SSE-NEXT: pandn %xmm0, %xmm4
1187 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1188 ; SSE-NEXT: movdqa 96(%rdi), %xmm13
1189 ; SSE-NEXT: movdqa %xmm13, %xmm4
1190 ; SSE-NEXT: pand %xmm10, %xmm4
1191 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1192 ; SSE-NEXT: movdqa 176(%rdi), %xmm4
1193 ; SSE-NEXT: movdqa %xmm4, %xmm10
1194 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1195 ; SSE-NEXT: pand %xmm1, %xmm10
1196 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1197 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1198 ; SSE-NEXT: movdqa %xmm9, %xmm10
1199 ; SSE-NEXT: pand %xmm1, %xmm9
1200 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1201 ; SSE-NEXT: pand %xmm1, %xmm14
1202 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1203 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1204 ; SSE-NEXT: movdqa %xmm14, %xmm9
1205 ; SSE-NEXT: pand %xmm1, %xmm14
1206 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1207 ; SSE-NEXT: pand %xmm1, %xmm0
1208 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1209 ; SSE-NEXT: movdqa %xmm1, %xmm14
1210 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1211 ; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
1212 ; SSE-NEXT: pandn %xmm13, %xmm1
1213 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1214 ; SSE-NEXT: movdqa %xmm13, %xmm1
1215 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1216 ; SSE-NEXT: por %xmm3, %xmm1
1217 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,1,3]
1218 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
1219 ; SSE-NEXT: pand %xmm0, %xmm3
1220 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
1221 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3]
1222 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7]
1223 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
1224 ; SSE-NEXT: packuswb %xmm3, %xmm3
1225 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
1226 ; SSE-NEXT: por %xmm6, %xmm3
1227 ; SSE-NEXT: pandn %xmm4, %xmm14
1228 ; SSE-NEXT: por %xmm14, %xmm5
1229 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[3,1,2,0]
1230 ; SSE-NEXT: pand %xmm0, %xmm4
1231 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
1232 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0]
1233 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5]
1234 ; SSE-NEXT: packuswb %xmm4, %xmm4
1235 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1236 ; SSE-NEXT: movdqa %xmm13, %xmm0
1237 ; SSE-NEXT: pandn %xmm4, %xmm0
1238 ; SSE-NEXT: pand %xmm13, %xmm3
1239 ; SSE-NEXT: por %xmm3, %xmm0
1240 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1241 ; SSE-NEXT: pxor %xmm4, %xmm4
1242 ; SSE-NEXT: movdqa %xmm15, %xmm3
1243 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
1244 ; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3],xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7]
1245 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm15[2,2,3,3]
1246 ; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3]
1247 ; SSE-NEXT: psrld $16, %xmm3
1248 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,0,3]
1249 ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,7]
1250 ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm3[2],xmm15[3],xmm3[3]
1251 ; SSE-NEXT: packuswb %xmm15, %xmm14
1252 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,0,65535,65535]
1253 ; SSE-NEXT: movdqa %xmm6, %xmm3
1254 ; SSE-NEXT: pandn %xmm14, %xmm3
1255 ; SSE-NEXT: movdqa %xmm8, %xmm14
1256 ; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm4[8],xmm14[9],xmm4[9],xmm14[10],xmm4[10],xmm14[11],xmm4[11],xmm14[12],xmm4[12],xmm14[13],xmm4[13],xmm14[14],xmm4[14],xmm14[15],xmm4[15]
1257 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,1,0,3]
1258 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[1,1,1,1,4,5,6,7]
1259 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,7,6,7]
1260 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,0,0,65535,65535]
1261 ; SSE-NEXT: movdqa %xmm15, %xmm0
1262 ; SSE-NEXT: pandn %xmm14, %xmm0
1263 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
1264 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7]
1265 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
1266 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm8[1,3,2,0,4,5,6,7]
1267 ; SSE-NEXT: pand %xmm15, %xmm14
1268 ; SSE-NEXT: por %xmm0, %xmm14
1269 ; SSE-NEXT: packuswb %xmm14, %xmm14
1270 ; SSE-NEXT: pand %xmm6, %xmm14
1271 ; SSE-NEXT: por %xmm3, %xmm14
1272 ; SSE-NEXT: movdqa %xmm7, %xmm0
1273 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1274 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
1275 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,65535,0,65535]
1276 ; SSE-NEXT: movdqa %xmm8, %xmm3
1277 ; SSE-NEXT: pandn %xmm0, %xmm3
1278 ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
1279 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[3,1,2,3,4,5,6,7]
1280 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
1281 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,4]
1282 ; SSE-NEXT: pand %xmm8, %xmm0
1283 ; SSE-NEXT: por %xmm3, %xmm0
1284 ; SSE-NEXT: packuswb %xmm0, %xmm0
1285 ; SSE-NEXT: movdqa %xmm13, %xmm3
1286 ; SSE-NEXT: pandn %xmm0, %xmm3
1287 ; SSE-NEXT: pand %xmm13, %xmm14
1288 ; SSE-NEXT: por %xmm14, %xmm3
1289 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1290 ; SSE-NEXT: movdqa %xmm2, %xmm0
1291 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
1292 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1293 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,3,3]
1294 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
1295 ; SSE-NEXT: psrld $16, %xmm0
1296 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
1297 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7]
1298 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1299 ; SSE-NEXT: packuswb %xmm2, %xmm3
1300 ; SSE-NEXT: movdqa %xmm1, %xmm0
1301 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
1302 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
1303 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
1304 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
1305 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1306 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
1307 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
1308 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,0,4,5,6,7]
1309 ; SSE-NEXT: pand %xmm15, %xmm1
1310 ; SSE-NEXT: pandn %xmm0, %xmm15
1311 ; SSE-NEXT: por %xmm1, %xmm15
1312 ; SSE-NEXT: packuswb %xmm15, %xmm15
1313 ; SSE-NEXT: pand %xmm6, %xmm15
1314 ; SSE-NEXT: pandn %xmm3, %xmm6
1315 ; SSE-NEXT: por %xmm6, %xmm15
1316 ; SSE-NEXT: movdqa %xmm5, %xmm0
1317 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
1318 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
1319 ; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
1320 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,1,2,3,4,5,6,7]
1321 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
1322 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4]
1323 ; SSE-NEXT: pand %xmm8, %xmm1
1324 ; SSE-NEXT: pandn %xmm0, %xmm8
1325 ; SSE-NEXT: por %xmm1, %xmm8
1326 ; SSE-NEXT: packuswb %xmm8, %xmm0
1327 ; SSE-NEXT: movdqa %xmm13, %xmm1
1328 ; SSE-NEXT: pandn %xmm0, %xmm1
1329 ; SSE-NEXT: pand %xmm13, %xmm15
1330 ; SSE-NEXT: movdqa %xmm13, %xmm7
1331 ; SSE-NEXT: por %xmm15, %xmm1
1332 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1333 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0]
1334 ; SSE-NEXT: pand %xmm5, %xmm10
1335 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
1336 ; SSE-NEXT: movdqa %xmm10, %xmm0
1337 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255]
1338 ; SSE-NEXT: pand %xmm15, %xmm0
1339 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7]
1340 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1341 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7]
1342 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
1343 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
1344 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,5,6]
1345 ; SSE-NEXT: packuswb %xmm1, %xmm2
1346 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
1347 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[2,1,2,3,4,5,6,7]
1348 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
1349 ; SSE-NEXT: pand %xmm15, %xmm0
1350 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
1351 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
1352 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,5,5,5]
1353 ; SSE-NEXT: packuswb %xmm1, %xmm1
1354 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
1355 ; SSE-NEXT: movdqa %xmm0, %xmm3
1356 ; SSE-NEXT: pandn %xmm1, %xmm3
1357 ; SSE-NEXT: pand %xmm0, %xmm2
1358 ; SSE-NEXT: por %xmm2, %xmm3
1359 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1360 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1361 ; SSE-NEXT: pandn %xmm14, %xmm1
1362 ; SSE-NEXT: por %xmm1, %xmm11
1363 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,3,2,3,4,5,6,7]
1364 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
1365 ; SSE-NEXT: pand %xmm15, %xmm1
1366 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
1367 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7]
1368 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4]
1369 ; SSE-NEXT: packuswb %xmm1, %xmm1
1370 ; SSE-NEXT: movdqa %xmm13, %xmm2
1371 ; SSE-NEXT: pandn %xmm1, %xmm2
1372 ; SSE-NEXT: pand %xmm13, %xmm3
1373 ; SSE-NEXT: por %xmm3, %xmm2
1374 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1375 ; SSE-NEXT: pand %xmm5, %xmm9
1376 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
1377 ; SSE-NEXT: movdqa %xmm9, %xmm1
1378 ; SSE-NEXT: pand %xmm15, %xmm1
1379 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,7,6,7]
1380 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1381 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7]
1382 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
1383 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
1384 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6]
1385 ; SSE-NEXT: packuswb %xmm2, %xmm1
1386 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
1387 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
1388 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[2,1,2,3,4,5,6,7]
1389 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
1390 ; SSE-NEXT: pand %xmm15, %xmm2
1391 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
1392 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
1393 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
1394 ; SSE-NEXT: packuswb %xmm2, %xmm2
1395 ; SSE-NEXT: movdqa %xmm0, %xmm3
1396 ; SSE-NEXT: pandn %xmm2, %xmm3
1397 ; SSE-NEXT: pand %xmm0, %xmm1
1398 ; SSE-NEXT: por %xmm1, %xmm3
1399 ; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
1400 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1401 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1402 ; SSE-NEXT: por %xmm1, %xmm8
1403 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,3,2,3,4,5,6,7]
1404 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
1405 ; SSE-NEXT: pand %xmm15, %xmm1
1406 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
1407 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7]
1408 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4]
1409 ; SSE-NEXT: packuswb %xmm1, %xmm1
1410 ; SSE-NEXT: movdqa %xmm7, %xmm2
1411 ; SSE-NEXT: pandn %xmm1, %xmm2
1412 ; SSE-NEXT: pand %xmm7, %xmm3
1413 ; SSE-NEXT: por %xmm3, %xmm2
1414 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1415 ; SSE-NEXT: movdqa %xmm10, %xmm1
1416 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1417 ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15]
1418 ; SSE-NEXT: movdqa %xmm10, %xmm2
1419 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0]
1420 ; SSE-NEXT: movaps %xmm1, %xmm3
1421 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2]
1422 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm10[0,0]
1423 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[2,3]
1424 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7]
1425 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
1426 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
1427 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
1428 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
1429 ; SSE-NEXT: packuswb %xmm1, %xmm2
1430 ; SSE-NEXT: movdqa %xmm12, %xmm1
1431 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1432 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
1433 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,1,2,1,4,5,6,7]
1434 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,65535,65535]
1435 ; SSE-NEXT: movdqa %xmm1, %xmm5
1436 ; SSE-NEXT: pandn %xmm3, %xmm5
1437 ; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15]
1438 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,3,2,1]
1439 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7]
1440 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
1441 ; SSE-NEXT: pand %xmm1, %xmm3
1442 ; SSE-NEXT: por %xmm5, %xmm3
1443 ; SSE-NEXT: packuswb %xmm3, %xmm3
1444 ; SSE-NEXT: movdqa %xmm0, %xmm5
1445 ; SSE-NEXT: pandn %xmm3, %xmm5
1446 ; SSE-NEXT: pand %xmm0, %xmm2
1447 ; SSE-NEXT: por %xmm2, %xmm5
1448 ; SSE-NEXT: movdqa %xmm11, %xmm2
1449 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
1450 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
1451 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,7,5,6,5]
1452 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,0]
1453 ; SSE-NEXT: movdqa %xmm2, %xmm6
1454 ; SSE-NEXT: pandn %xmm3, %xmm6
1455 ; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
1456 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,0,3]
1457 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
1458 ; SSE-NEXT: pand %xmm2, %xmm3
1459 ; SSE-NEXT: por %xmm6, %xmm3
1460 ; SSE-NEXT: packuswb %xmm3, %xmm3
1461 ; SSE-NEXT: movdqa %xmm7, %xmm6
1462 ; SSE-NEXT: pandn %xmm3, %xmm6
1463 ; SSE-NEXT: pand %xmm7, %xmm5
1464 ; SSE-NEXT: por %xmm5, %xmm6
1465 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1466 ; SSE-NEXT: movdqa %xmm9, %xmm3
1467 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1468 ; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15]
1469 ; SSE-NEXT: movdqa %xmm9, %xmm5
1470 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[3,0]
1471 ; SSE-NEXT: movaps %xmm3, %xmm6
1472 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,2]
1473 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm9[0,0]
1474 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm9[2,3]
1475 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,7,5,6,7]
1476 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2]
1477 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
1478 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
1479 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7]
1480 ; SSE-NEXT: packuswb %xmm3, %xmm5
1481 ; SSE-NEXT: movdqa %xmm13, %xmm3
1482 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1483 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
1484 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7]
1485 ; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm4[8],xmm13[9],xmm4[9],xmm13[10],xmm4[10],xmm13[11],xmm4[11],xmm13[12],xmm4[12],xmm13[13],xmm4[13],xmm13[14],xmm4[14],xmm13[15],xmm4[15]
1486 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[0,3,2,1]
1487 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7]
1488 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
1489 ; SSE-NEXT: pand %xmm1, %xmm6
1490 ; SSE-NEXT: pandn %xmm3, %xmm1
1491 ; SSE-NEXT: por %xmm6, %xmm1
1492 ; SSE-NEXT: pand %xmm0, %xmm5
1493 ; SSE-NEXT: packuswb %xmm1, %xmm1
1494 ; SSE-NEXT: pandn %xmm1, %xmm0
1495 ; SSE-NEXT: por %xmm5, %xmm0
1496 ; SSE-NEXT: movdqa %xmm8, %xmm1
1497 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
1498 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1499 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5]
1500 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
1501 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,0,3]
1502 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
1503 ; SSE-NEXT: pand %xmm2, %xmm3
1504 ; SSE-NEXT: pandn %xmm1, %xmm2
1505 ; SSE-NEXT: por %xmm3, %xmm2
1506 ; SSE-NEXT: movdqa %xmm7, %xmm13
1507 ; SSE-NEXT: pand %xmm7, %xmm0
1508 ; SSE-NEXT: packuswb %xmm2, %xmm1
1509 ; SSE-NEXT: pandn %xmm1, %xmm13
1510 ; SSE-NEXT: por %xmm0, %xmm13
1511 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
1512 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
1513 ; SSE-NEXT: movdqa %xmm7, %xmm0
1514 ; SSE-NEXT: pand %xmm15, %xmm0
1515 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3]
1516 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
1517 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
1518 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
1519 ; SSE-NEXT: packuswb %xmm1, %xmm0
1520 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255]
1521 ; SSE-NEXT: movdqa %xmm2, %xmm1
1522 ; SSE-NEXT: pandn %xmm0, %xmm1
1523 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1524 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
1525 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,1,2,0]
1526 ; SSE-NEXT: pand %xmm15, %xmm0
1527 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
1528 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
1529 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,1,0,3,4,5,6,7]
1530 ; SSE-NEXT: packuswb %xmm3, %xmm3
1531 ; SSE-NEXT: pand %xmm2, %xmm3
1532 ; SSE-NEXT: por %xmm1, %xmm3
1533 ; SSE-NEXT: movdqa %xmm14, %xmm11
1534 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0]
1535 ; SSE-NEXT: pand %xmm12, %xmm11
1536 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
1537 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,2,1,3]
1538 ; SSE-NEXT: pand %xmm15, %xmm0
1539 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
1540 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1541 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7]
1542 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7]
1543 ; SSE-NEXT: packuswb %xmm0, %xmm5
1544 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
1545 ; SSE-NEXT: movdqa %xmm0, %xmm8
1546 ; SSE-NEXT: pandn %xmm5, %xmm8
1547 ; SSE-NEXT: pand %xmm0, %xmm3
1548 ; SSE-NEXT: por %xmm3, %xmm8
1549 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1550 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
1551 ; SSE-NEXT: movdqa %xmm14, %xmm3
1552 ; SSE-NEXT: pand %xmm15, %xmm3
1553 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,1,2,3]
1554 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
1555 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
1556 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
1557 ; SSE-NEXT: packuswb %xmm5, %xmm3
1558 ; SSE-NEXT: movdqa %xmm2, %xmm5
1559 ; SSE-NEXT: pandn %xmm3, %xmm5
1560 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1561 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1562 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,1,2,0]
1563 ; SSE-NEXT: pand %xmm15, %xmm3
1564 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
1565 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3]
1566 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[2,1,0,3,4,5,6,7]
1567 ; SSE-NEXT: packuswb %xmm6, %xmm6
1568 ; SSE-NEXT: pand %xmm2, %xmm6
1569 ; SSE-NEXT: por %xmm5, %xmm6
1570 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1571 ; SSE-NEXT: pand %xmm12, %xmm3
1572 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
1573 ; SSE-NEXT: por %xmm3, %xmm12
1574 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,2,1,3]
1575 ; SSE-NEXT: pand %xmm15, %xmm3
1576 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
1577 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3]
1578 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7]
1579 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
1580 ; SSE-NEXT: packuswb %xmm3, %xmm5
1581 ; SSE-NEXT: movdqa %xmm0, %xmm3
1582 ; SSE-NEXT: pandn %xmm5, %xmm3
1583 ; SSE-NEXT: pand %xmm0, %xmm6
1584 ; SSE-NEXT: por %xmm6, %xmm3
1585 ; SSE-NEXT: movdqa %xmm7, %xmm5
1586 ; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
1587 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
1588 ; SSE-NEXT: movdqa %xmm7, %xmm6
1589 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm5[0,0]
1590 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[2,3]
1591 ; SSE-NEXT: psrlq $48, %xmm5
1592 ; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1593 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
1594 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[3,1,2,3,4,5,6,7]
1595 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
1596 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,7]
1597 ; SSE-NEXT: packuswb %xmm6, %xmm5
1598 ; SSE-NEXT: movdqa %xmm2, %xmm6
1599 ; SSE-NEXT: pandn %xmm5, %xmm6
1600 ; SSE-NEXT: movdqa %xmm9, %xmm5
1601 ; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
1602 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
1603 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,5,5,5,5]
1604 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535]
1605 ; SSE-NEXT: movdqa %xmm5, %xmm10
1606 ; SSE-NEXT: pandn %xmm7, %xmm10
1607 ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
1608 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,7,5,6,7]
1609 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
1610 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[3,1,1,2,4,5,6,7]
1611 ; SSE-NEXT: pand %xmm5, %xmm9
1612 ; SSE-NEXT: por %xmm10, %xmm9
1613 ; SSE-NEXT: packuswb %xmm9, %xmm9
1614 ; SSE-NEXT: pand %xmm2, %xmm9
1615 ; SSE-NEXT: por %xmm6, %xmm9
1616 ; SSE-NEXT: movdqa %xmm11, %xmm6
1617 ; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15]
1618 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm11[0,1,2,3,7,5,6,7]
1619 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0]
1620 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,5,5,7,4]
1621 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,0]
1622 ; SSE-NEXT: movdqa %xmm7, %xmm11
1623 ; SSE-NEXT: pandn %xmm10, %xmm11
1624 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
1625 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,1,1]
1626 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
1627 ; SSE-NEXT: pand %xmm7, %xmm6
1628 ; SSE-NEXT: por %xmm6, %xmm11
1629 ; SSE-NEXT: packuswb %xmm11, %xmm10
1630 ; SSE-NEXT: movdqa %xmm0, %xmm6
1631 ; SSE-NEXT: pandn %xmm10, %xmm6
1632 ; SSE-NEXT: pand %xmm0, %xmm9
1633 ; SSE-NEXT: por %xmm9, %xmm6
1634 ; SSE-NEXT: movdqa %xmm14, %xmm11
1635 ; SSE-NEXT: movdqa %xmm14, %xmm9
1636 ; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15]
1637 ; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
1638 ; SSE-NEXT: movdqa %xmm11, %xmm10
1639 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm9[0,0]
1640 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm9[2,3]
1641 ; SSE-NEXT: psrlq $48, %xmm9
1642 ; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1643 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
1644 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm11[3,1,2,3,4,5,6,7]
1645 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3]
1646 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,7]
1647 ; SSE-NEXT: packuswb %xmm10, %xmm9
1648 ; SSE-NEXT: movdqa %xmm1, %xmm10
1649 ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15]
1650 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3]
1651 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5]
1652 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
1653 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,7,5,6,7]
1654 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3]
1655 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,1,2,4,5,6,7]
1656 ; SSE-NEXT: pand %xmm5, %xmm11
1657 ; SSE-NEXT: pandn %xmm10, %xmm5
1658 ; SSE-NEXT: por %xmm11, %xmm5
1659 ; SSE-NEXT: packuswb %xmm5, %xmm5
1660 ; SSE-NEXT: pand %xmm2, %xmm5
1661 ; SSE-NEXT: pandn %xmm9, %xmm2
1662 ; SSE-NEXT: por %xmm2, %xmm5
1663 ; SSE-NEXT: movdqa %xmm12, %xmm2
1664 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1665 ; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15]
1666 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,1,1]
1667 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
1668 ; SSE-NEXT: pand %xmm7, %xmm2
1669 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,7,5,6,7]
1670 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0]
1671 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,7,4]
1672 ; SSE-NEXT: pandn %xmm4, %xmm7
1673 ; SSE-NEXT: por %xmm2, %xmm7
1674 ; SSE-NEXT: pand %xmm0, %xmm5
1675 ; SSE-NEXT: packuswb %xmm7, %xmm2
1676 ; SSE-NEXT: pandn %xmm2, %xmm0
1677 ; SSE-NEXT: por %xmm5, %xmm0
1678 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1679 ; SSE-NEXT: movaps %xmm2, 16(%rsi)
1680 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1681 ; SSE-NEXT: movaps %xmm2, (%rsi)
1682 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1683 ; SSE-NEXT: movaps %xmm1, 16(%rdx)
1684 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1685 ; SSE-NEXT: movaps %xmm1, (%rdx)
1686 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1687 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
1688 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1689 ; SSE-NEXT: movaps %xmm1, (%rcx)
1690 ; SSE-NEXT: movdqa %xmm13, 16(%r8)
1691 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1692 ; SSE-NEXT: movaps %xmm1, (%r8)
1693 ; SSE-NEXT: movdqa %xmm3, 16(%r9)
1694 ; SSE-NEXT: movdqa %xmm8, (%r9)
1695 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1696 ; SSE-NEXT: movdqa %xmm0, 16(%rax)
1697 ; SSE-NEXT: movdqa %xmm6, (%rax)
1698 ; SSE-NEXT: addq $264, %rsp # imm = 0x108
1699 ; SSE-NEXT: retq
1701 ; AVX1-ONLY-LABEL: load_i8_stride6_vf32:
1702 ; AVX1-ONLY: # %bb.0:
1703 ; AVX1-ONLY-NEXT: subq $120, %rsp
1704 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm9
1705 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7
1706 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6
1707 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8
1708 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u]
1709 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u]
1710 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1711 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm7[4,10,u,u,u,u,u,u,u,u,u,u,u]
1712 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[2,8,14],zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u]
1713 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2
1714 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u]
1715 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
1716 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1717 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u]
1718 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u]
1719 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1720 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm7[5,11,u,u,u,u,u,u,u,u,u,u,u]
1721 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[3,9,15],zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u]
1722 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
1723 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
1724 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1725 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u]
1726 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm7[0,6,12,u,u,u,u,u,u,u,u,u,u,u]
1727 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1
1728 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u]
1729 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14]
1730 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
1731 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u]
1732 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
1733 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1734 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u]
1735 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1736 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm7[1,7,13,u,u,u,u,u,u,u,u,u,u,u]
1737 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
1738 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm5
1739 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1740 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u]
1741 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1742 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15]
1743 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
1744 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
1745 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1746 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0]
1747 ; AVX1-ONLY-NEXT: # xmm8 = mem[0,0]
1748 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0
1749 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1750 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm1
1751 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0]
1752 ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
1753 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm13
1754 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm3
1755 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1
1756 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10]
1757 ; AVX1-ONLY-NEXT: # xmm11 = mem[0,0]
1758 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm12
1759 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm4
1760 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128]
1761 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
1762 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm14
1763 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm10
1764 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm10, %xmm4
1765 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
1766 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [0,0,4,10,0,0,4,10,0,0,4,10,0,0,4,10]
1767 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm4
1768 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0]
1769 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm10
1770 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1]
1771 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm7, %xmm8
1772 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm10
1773 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm2
1774 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm8, %xmm2
1775 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5],xmm2[6,7]
1776 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255]
1777 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1
1778 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2
1779 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm8
1780 ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm6
1781 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm1
1782 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm5
1783 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0
1784 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1785 ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm4
1786 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm11
1787 ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm2
1788 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm3
1789 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm11, %xmm11
1790 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
1791 ; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm15, %xmm11, %xmm15
1792 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
1793 ; AVX1-ONLY-NEXT: vandps %ymm11, %ymm8, %ymm8
1794 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
1795 ; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm11, %ymm15
1796 ; AVX1-ONLY-NEXT: vorps %ymm15, %ymm8, %ymm0
1797 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1798 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1799 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm9[3,9,15,u,u,u,u,u,u,u,u,u,u]
1800 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[1,7,13],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u]
1801 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm1
1802 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[5,11]
1803 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm7
1804 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
1805 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm0
1806 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
1807 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,5,11,0,0,5,11,0,0,5,11,0,0,5,11]
1808 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1809 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm15
1810 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0]
1811 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1812 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm14
1813 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1]
1814 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm10[3,9,15,u,u,u,u,u,u,u,u,u,u]
1815 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1816 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
1817 ; AVX1-ONLY-NEXT: vpor %xmm15, %xmm10, %xmm10
1818 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4,5],xmm10[6,7]
1819 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255]
1820 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm14, %ymm0
1821 ; AVX1-ONLY-NEXT: vandps %ymm14, %ymm10, %ymm10
1822 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm0
1823 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm8
1824 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm1
1825 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1]
1826 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[5,11]
1827 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero
1828 ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm10, %xmm8
1829 ; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm8, %xmm1
1830 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0
1831 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
1832 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1
1833 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0
1834 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1835 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm9[4,10,u,u,u,u,u,u,u,u,u,u,u]
1836 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[2,8,14],zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u,u]
1837 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0
1838 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128]
1839 ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
1840 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm10
1841 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12]
1842 ; AVX1-ONLY-NEXT: # xmm11 = mem[0,0]
1843 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm12, %xmm14
1844 ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm14, %xmm10
1845 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0
1846 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0
1847 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload
1848 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm10, %ymm0
1849 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[2,8,14,u,u,u,u,u,u,u,u,u,u,u,u,u]
1850 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,0,6,12,u,u,u,u,u,u,u,u]
1851 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm14[0],xmm10[0]
1852 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1
1853 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm4, %xmm11
1854 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm11, %xmm1
1855 ; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm10, %xmm1, %xmm1
1856 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
1857 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0
1858 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
1859 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1
1860 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm11
1861 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm9[5,11,u,u,u,u,u,u,u,u,u,u,u]
1862 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[3,9,15],zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u,u]
1863 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0
1864 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128]
1865 ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
1866 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm14
1867 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13]
1868 ; AVX1-ONLY-NEXT: # xmm15 = mem[0,0]
1869 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm8
1870 ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm8, %xmm8
1871 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0
1872 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u]
1873 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,1,7,13,u,u,u,u,u,u,u,u]
1874 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm14[0],xmm8[0]
1875 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm1
1876 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm14
1877 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm14, %xmm1
1878 ; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm8, %xmm1, %xmm1
1879 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm3, %ymm0
1880 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
1881 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0
1882 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0
1883 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
1884 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1
1885 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm3
1886 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[4,10],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u,u]
1887 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm9[0,6,12,u,u,u,u,u,u,u,u,u,u,u]
1888 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0
1889 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14]
1890 ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
1891 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm8
1892 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128]
1893 ; AVX1-ONLY-NEXT: # xmm14 = mem[0,0]
1894 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15
1895 ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm15, %xmm8
1896 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0
1897 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
1898 ; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm15 # 32-byte Folded Reload
1899 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm8, %ymm0
1900 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm15, %ymm0
1901 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1
1902 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm14
1903 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm14, %xmm1
1904 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u]
1905 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14]
1906 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm14 = xmm15[1],xmm14[1]
1907 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3,4],xmm1[5,6,7]
1908 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm10, %ymm0
1909 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
1910 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm10, %ymm1
1911 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0
1912 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[5,11],zero,zero,zero,xmm13[u,u,u,u,u,u,u,u,u,u,u]
1913 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[1,7,13,u,u,u,u,u,u,u,u,u,u,u]
1914 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm9, %xmm1
1915 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15]
1916 ; AVX1-ONLY-NEXT: # xmm9 = mem[0,0]
1917 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm12, %xmm12
1918 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128]
1919 ; AVX1-ONLY-NEXT: # xmm13 = mem[0,0]
1920 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm7
1921 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm12, %xmm7
1922 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1
1923 ; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload
1924 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm8, %ymm1
1925 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm1, %ymm1
1926 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm4
1927 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm2
1928 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm2, %xmm2
1929 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,5,11,u,u,u,u,u,u]
1930 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,3,9,15]
1931 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm5[1],xmm4[1]
1932 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3,4],xmm2[5,6,7]
1933 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm10, %ymm1
1934 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
1935 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm10, %ymm2
1936 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1
1937 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1938 ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rsi)
1939 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
1940 ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%rdx)
1941 ; AVX1-ONLY-NEXT: vmovaps %ymm11, (%rcx)
1942 ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%r8)
1943 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9)
1944 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
1945 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rax)
1946 ; AVX1-ONLY-NEXT: addq $120, %rsp
1947 ; AVX1-ONLY-NEXT: vzeroupper
1948 ; AVX1-ONLY-NEXT: retq
1949 ;
1950 ; AVX2-ONLY-LABEL: load_i8_stride6_vf32:
1951 ; AVX2-ONLY: # %bb.0:
1952 ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm4
1953 ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm2
1954 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm3
1955 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0
1956 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm1
1957 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255]
1958 ; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm9
1959 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u]
1960 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm10
1961 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm10[2,8,14],zero,zero,xmm10[0,6,12,u,u,u,u,u]
1962 ; AVX2-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm11
1963 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0]
1964 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1]
1965 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3]
1966 ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm1
1967 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u]
1968 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
1969 ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm11, %ymm0, %ymm0
1970 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,7,13],zero,zero,zero,xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u]
1971 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[3,9,15],zero,zero,xmm10[1,7,13,u,u,u,u,u]
1972 ; AVX2-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9
1973 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u]
1974 ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm9, %ymm1, %ymm1
1975 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255]
1976 ; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm3, %ymm2, %ymm9
1977 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm10
1978 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u]
1979 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero,xmm9[u,u,u,u,u]
1980 ; AVX2-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12
1981 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0]
1982 ; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm6, %ymm5, %ymm13
1983 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u]
1984 ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm12, %ymm14, %ymm12
1985 ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm14
1986 ; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm14, %ymm4, %ymm8
1987 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,11],zero,zero,zero,xmm10[3,9,15,u,u,u,u,u]
1988 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[3,9,15],zero,zero,xmm9[1,7,13],zero,zero,zero,xmm9[u,u,u,u,u]
1989 ; AVX2-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm9
1990 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,0,6,12],zero,zero,zero,xmm8[4,10],zero,zero,zero
1991 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u]
1992 ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm9, %ymm13, %ymm13
1993 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm8, %xmm9
1994 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12]
1995 ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7
1996 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
1997 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
1998 ; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm12, %ymm7, %ymm7
1999 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u]
2000 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,1,7,13],zero,zero,zero,xmm8[5,11],zero,zero,zero
2001 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13]
2002 ; AVX2-ONLY-NEXT: vpor %xmm8, %xmm9, %xmm8
2003 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm4, %ymm14, %ymm9
2004 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
2005 ; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm13, %ymm8, %ymm8
2006 ; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm4, %ymm14, %ymm4
2007 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm4, %xmm11
2008 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14]
2009 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,2,8,14],zero,zero,xmm4[0,6,12],zero,zero,zero
2010 ; AVX2-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13
2011 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm2
2012 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm2, %xmm3
2013 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u]
2014 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm6, %ymm5, %ymm5
2015 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u]
2016 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,10],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[u,u,u,u,u,u]
2017 ; AVX2-ONLY-NEXT: vpor %xmm6, %xmm12, %xmm6
2018 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u]
2019 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm12[5,6,7]
2020 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
2021 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm12
2022 ; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm6, %ymm12, %ymm6
2023 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15]
2024 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,3,9,15],zero,zero,xmm4[1,7,13],zero,zero,zero
2025 ; AVX2-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4
2026 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u]
2027 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,11],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[u,u,u,u,u,u]
2028 ; AVX2-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2
2029 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u]
2030 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
2031 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
2032 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm3
2033 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2034 ; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm2, %ymm4, %ymm2
2035 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10]
2036 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,4,10],zero,zero,zero,xmm9[2,8,14],zero,zero
2037 ; AVX2-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4
2038 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2039 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3,4,5,6,7],ymm0[8,9,10],ymm4[11,12,13,14,15]
2040 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
2041 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11]
2042 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,5,11],zero,zero,zero,xmm9[3,9,15],zero,zero
2043 ; AVX2-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
2044 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
2045 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15]
2046 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
2047 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rsi)
2048 ; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%rdx)
2049 ; AVX2-ONLY-NEXT: vmovdqa %ymm7, (%rcx)
2050 ; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%r8)
2051 ; AVX2-ONLY-NEXT: vmovdqa %ymm6, (%r9)
2052 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
2053 ; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rax)
2054 ; AVX2-ONLY-NEXT: vzeroupper
2055 ; AVX2-ONLY-NEXT: retq
2056 ;
2057 ; AVX512F-LABEL: load_i8_stride6_vf32:
2058 ; AVX512F: # %bb.0:
2059 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
2060 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
2061 ; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm17
2062 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
2063 ; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm1
2064 ; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm6
2065 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm7
2066 ; AVX512F-NEXT: vpternlogq $202, %ymm3, %ymm17, %ymm7
2067 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u]
2068 ; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm8
2069 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm8[2,8,14],zero,zero,xmm8[0,6,12,u,u,u,u,u]
2070 ; AVX512F-NEXT: vpor %xmm4, %xmm5, %xmm4
2071 ; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],mem[2,3]
2072 ; AVX512F-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1
2073 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0]
2074 ; AVX512F-NEXT: vmovdqa %ymm9, %ymm10
2075 ; AVX512F-NEXT: vpternlogq $202, %ymm5, %ymm1, %ymm10
2076 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2077 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255]
2078 ; AVX512F-NEXT: vpternlogq $248, %ymm16, %ymm4, %ymm11
2079 ; AVX512F-NEXT: vmovdqa 160(%rdi), %ymm13
2080 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm14
2081 ; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm14
2082 ; AVX512F-NEXT: vextracti128 $1, %ymm14, %xmm15
2083 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[0,6,12],zero,zero,zero,xmm15[4,10]
2084 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,4,10],zero,zero,zero,xmm14[2,8,14],zero,zero
2085 ; AVX512F-NEXT: vpor %xmm4, %xmm12, %xmm4
2086 ; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2087 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7],ymm11[8,9,10],ymm4[11,12,13,14,15]
2088 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm4[4,5,6,7]
2089 ; AVX512F-NEXT: vmovdqa64 %ymm2, %ymm18
2090 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u]
2091 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,9,15],zero,zero,xmm8[1,7,13,u,u,u,u,u]
2092 ; AVX512F-NEXT: vpor %xmm7, %xmm8, %xmm7
2093 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2094 ; AVX512F-NEXT: vpternlogq $248, %ymm16, %ymm7, %ymm8
2095 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[u,u,u,u,u,u],zero,zero,xmm15[1,7,13],zero,zero,zero,xmm15[5,11]
2096 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u,u,5,11],zero,zero,zero,xmm14[3,9,15],zero,zero
2097 ; AVX512F-NEXT: vpor %xmm7, %xmm10, %xmm7
2098 ; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
2099 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15]
2100 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
2101 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
2102 ; AVX512F-NEXT: vmovdqa %ymm8, %ymm10
2103 ; AVX512F-NEXT: vpternlogq $202, %ymm17, %ymm3, %ymm10
2104 ; AVX512F-NEXT: vextracti128 $1, %ymm10, %xmm11
2105 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u]
2106 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u]
2107 ; AVX512F-NEXT: vpor %xmm12, %xmm14, %xmm12
2108 ; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm5, %ymm9
2109 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u]
2110 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
2111 ; AVX512F-NEXT: vpternlogq $248, %ymm16, %ymm12, %ymm14
2112 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm12
2113 ; AVX512F-NEXT: vpternlogq $202, %ymm13, %ymm6, %ymm12
2114 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero
2115 ; AVX512F-NEXT: vextracti128 $1, %ymm12, %xmm2
2116 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[0,6,12]
2117 ; AVX512F-NEXT: vpor %xmm4, %xmm15, %xmm4
2118 ; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2119 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
2120 ; AVX512F-NEXT: vpternlogq $184, %ymm14, %ymm15, %ymm4
2121 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u]
2122 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u]
2123 ; AVX512F-NEXT: vpor %xmm11, %xmm10, %xmm10
2124 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u]
2125 ; AVX512F-NEXT: vpternlogq $248, %ymm16, %ymm10, %ymm9
2126 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero
2127 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[3,9,15],zero,zero,xmm2[1,7,13]
2128 ; AVX512F-NEXT: vpor %xmm2, %xmm10, %xmm2
2129 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
2130 ; AVX512F-NEXT: vpternlogq $184, %ymm9, %ymm15, %ymm2
2131 ; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm8
2132 ; AVX512F-NEXT: vextracti128 $1, %ymm8, %xmm6
2133 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[4,10],zero,zero,zero,xmm6[2,8,14]
2134 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,2,8,14],zero,zero,xmm8[0,6,12],zero,zero,zero
2135 ; AVX512F-NEXT: vpor %xmm9, %xmm10, %xmm9
2136 ; AVX512F-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
2137 ; AVX512F-NEXT: vpternlogq $202, %ymm17, %ymm3, %ymm0
2138 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3
2139 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u]
2140 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u]
2141 ; AVX512F-NEXT: vpor %xmm10, %xmm11, %xmm10
2142 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm1
2143 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u]
2144 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm5[5,6,7]
2145 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7]
2146 ; AVX512F-NEXT: vpternlogq $226, %ymm9, %ymm15, %ymm5
2147 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,zero,xmm6[5,11],zero,zero,zero,xmm6[3,9,15]
2148 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,3,9,15],zero,zero,xmm8[1,7,13],zero,zero,zero
2149 ; AVX512F-NEXT: vpor %xmm6, %xmm8, %xmm6
2150 ; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
2151 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,7,13],zero,zero,zero,xmm3[5,11,u,u,u,u,u,u]
2152 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u]
2153 ; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
2154 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u]
2155 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
2156 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2157 ; AVX512F-NEXT: vpternlogq $226, %ymm6, %ymm15, %ymm0
2158 ; AVX512F-NEXT: vmovdqa64 %ymm18, (%rsi)
2159 ; AVX512F-NEXT: vmovdqa %ymm7, (%rdx)
2160 ; AVX512F-NEXT: vmovdqa %ymm4, (%rcx)
2161 ; AVX512F-NEXT: vmovdqa %ymm2, (%r8)
2162 ; AVX512F-NEXT: vmovdqa %ymm5, (%r9)
2163 ; AVX512F-NEXT: vmovdqa %ymm0, (%rax)
2164 ; AVX512F-NEXT: vzeroupper
2165 ; AVX512F-NEXT: retq
2166 ;
2167 ; AVX512BW-LABEL: load_i8_stride6_vf32:
2168 ; AVX512BW: # %bb.0:
2169 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
2170 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm4
2171 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm0
2172 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm3
2173 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2
2174 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3]
2175 ; AVX512BW-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm8
2176 ; AVX512BW-NEXT: movw $-28124, %r10w # imm = 0x9224
2177 ; AVX512BW-NEXT: kmovd %r10d, %k2
2178 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm8, %ymm6 {%k2}
2179 ; AVX512BW-NEXT: movw $18724, %r10w # imm = 0x4924
2180 ; AVX512BW-NEXT: kmovd %r10d, %k1
2181 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm4, %ymm7 {%k1}
2182 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[0,6,12],zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[u,u,u,u,u]
2183 ; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm9
2184 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm9[2,8,14],zero,zero,xmm9[0,6,12,u,u,u,u,u]
2185 ; AVX512BW-NEXT: vpor %xmm3, %xmm5, %xmm5
2186 ; AVX512BW-NEXT: movl $4192256, %r10d # imm = 0x3FF800
2187 ; AVX512BW-NEXT: kmovd %r10d, %k3
2188 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u]
2189 ; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm3
2190 ; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm10 {%k1}
2191 ; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm11
2192 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[0,6,12],zero,zero,zero,xmm11[4,10]
2193 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,4,10],zero,zero,zero,xmm10[2,8,14],zero,zero
2194 ; AVX512BW-NEXT: vpor %xmm12, %xmm13, %xmm12
2195 ; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
2196 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0,1,2],ymm12[3,4,5,6,7],ymm5[8,9,10],ymm12[11,12,13,14,15]
2197 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7]
2198 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,7,13],zero,zero,zero,xmm7[5,11],zero,zero,zero,xmm7[u,u,u,u,u]
2199 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13,u,u,u,u,u]
2200 ; AVX512BW-NEXT: vpor %xmm7, %xmm9, %xmm7
2201 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 {%k3} = ymm6[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u]
2202 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[u,u,u,u,u,u],zero,zero,xmm11[1,7,13],zero,zero,zero,xmm11[5,11]
2203 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,5,11],zero,zero,zero,xmm10[3,9,15],zero,zero
2204 ; AVX512BW-NEXT: vpor %xmm6, %xmm9, %xmm6
2205 ; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
2206 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
2207 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
2208 ; AVX512BW-NEXT: vpblendmw %ymm8, %ymm1, %ymm9 {%k2}
2209 ; AVX512BW-NEXT: movw $9362, %di # imm = 0x2492
2210 ; AVX512BW-NEXT: kmovd %edi, %k3
2211 ; AVX512BW-NEXT: vpblendmw %ymm4, %ymm0, %ymm10 {%k3}
2212 ; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm11
2213 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm11[4,10],zero,zero,zero,xmm11[2,8,14,u,u,u,u,u]
2214 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[2,8,14],zero,zero,xmm10[0,6,12],zero,zero,zero,xmm10[u,u,u,u,u]
2215 ; AVX512BW-NEXT: vpor %xmm7, %xmm12, %xmm7
2216 ; AVX512BW-NEXT: movl $2095104, %edi # imm = 0x1FF800
2217 ; AVX512BW-NEXT: kmovd %edi, %k4
2218 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u]
2219 ; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm12 {%k1}
2220 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,u,u,0,6,12],zero,zero,zero,xmm12[4,10],zero,zero,zero
2221 ; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm14
2222 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[2,8,14],zero,zero,xmm14[0,6,12]
2223 ; AVX512BW-NEXT: vpor %xmm13, %xmm15, %xmm13
2224 ; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
2225 ; AVX512BW-NEXT: movl $-2097152, %edi # imm = 0xFFE00000
2226 ; AVX512BW-NEXT: kmovd %edi, %k2
2227 ; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm7 {%k2}
2228 ; AVX512BW-NEXT: movw $9289, %di # imm = 0x2449
2229 ; AVX512BW-NEXT: kmovd %edi, %k5
2230 ; AVX512BW-NEXT: vmovdqu16 %ymm8, %ymm1 {%k5}
2231 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm11[5,11],zero,zero,zero,xmm11[3,9,15,u,u,u,u,u]
2232 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[3,9,15],zero,zero,xmm10[1,7,13],zero,zero,zero,xmm10[u,u,u,u,u]
2233 ; AVX512BW-NEXT: vpor %xmm8, %xmm10, %xmm8
2234 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,u,u,5,11,1,7,13,19,25,31,21,27,u,u,u,u,u,u,u,u,u,u,u]
2235 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,1,7,13],zero,zero,zero,xmm12[5,11],zero,zero,zero
2236 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[u,u,u,u,u],zero,zero,zero,xmm14[3,9,15],zero,zero,xmm14[1,7,13]
2237 ; AVX512BW-NEXT: vpor %xmm9, %xmm10, %xmm9
2238 ; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
2239 ; AVX512BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k2}
2240 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u]
2241 ; AVX512BW-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1}
2242 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm4
2243 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u]
2244 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[4,10],zero,zero,zero,xmm0[2,8,14],zero,zero,xmm0[u,u,u,u,u,u]
2245 ; AVX512BW-NEXT: vpor %xmm10, %xmm11, %xmm10
2246 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm9[5,6,7]
2247 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
2248 ; AVX512BW-NEXT: vmovdqu16 %ymm2, %ymm3 {%k3}
2249 ; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm2
2250 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[4,10],zero,zero,zero,xmm2[2,8,14]
2251 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,2,8,14],zero,zero,xmm3[0,6,12],zero,zero,zero
2252 ; AVX512BW-NEXT: vpor %xmm10, %xmm11, %xmm10
2253 ; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
2254 ; AVX512BW-NEXT: vmovdqu8 %ymm10, %ymm9 {%k2}
2255 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u]
2256 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,7,13],zero,zero,zero,xmm4[5,11,u,u,u,u,u,u]
2257 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,11],zero,zero,zero,xmm0[3,9,15],zero,zero,xmm0[u,u,u,u,u,u]
2258 ; AVX512BW-NEXT: vpor %xmm4, %xmm0, %xmm0
2259 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
2260 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2261 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u],zero,zero,zero,xmm2[5,11],zero,zero,zero,xmm2[3,9,15]
2262 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,3,9,15],zero,zero,xmm3[1,7,13],zero,zero,zero
2263 ; AVX512BW-NEXT: vpor %xmm1, %xmm2, %xmm1
2264 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
2265 ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2}
2266 ; AVX512BW-NEXT: vmovdqa %ymm5, (%rsi)
2267 ; AVX512BW-NEXT: vmovdqa %ymm6, (%rdx)
2268 ; AVX512BW-NEXT: vmovdqa %ymm7, (%rcx)
2269 ; AVX512BW-NEXT: vmovdqa %ymm8, (%r8)
2270 ; AVX512BW-NEXT: vmovdqa %ymm9, (%r9)
2271 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rax)
2272 ; AVX512BW-NEXT: vzeroupper
2273 ; AVX512BW-NEXT: retq
2274 %wide.vec = load <192 x i8>, ptr %in.vec, align 64
2275 %strided.vec0 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <32 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90, i32 96, i32 102, i32 108, i32 114, i32 120, i32 126, i32 132, i32 138, i32 144, i32 150, i32 156, i32 162, i32 168, i32 174, i32 180, i32 186>
2276 %strided.vec1 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <32 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43, i32 49, i32 55, i32 61, i32 67, i32 73, i32 79, i32 85, i32 91, i32 97, i32 103, i32 109, i32 115, i32 121, i32 127, i32 133, i32 139, i32 145, i32 151, i32 157, i32 163, i32 169, i32 175, i32 181, i32 187>
2277 %strided.vec2 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <32 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44, i32 50, i32 56, i32 62, i32 68, i32 74, i32 80, i32 86, i32 92, i32 98, i32 104, i32 110, i32 116, i32 122, i32 128, i32 134, i32 140, i32 146, i32 152, i32 158, i32 164, i32 170, i32 176, i32 182, i32 188>
2278 %strided.vec3 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <32 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45, i32 51, i32 57, i32 63, i32 69, i32 75, i32 81, i32 87, i32 93, i32 99, i32 105, i32 111, i32 117, i32 123, i32 129, i32 135, i32 141, i32 147, i32 153, i32 159, i32 165, i32 171, i32 177, i32 183, i32 189>
2279 %strided.vec4 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <32 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46, i32 52, i32 58, i32 64, i32 70, i32 76, i32 82, i32 88, i32 94, i32 100, i32 106, i32 112, i32 118, i32 124, i32 130, i32 136, i32 142, i32 148, i32 154, i32 160, i32 166, i32 172, i32 178, i32 184, i32 190>
2280 %strided.vec5 = shufflevector <192 x i8> %wide.vec, <192 x i8> poison, <32 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47, i32 53, i32 59, i32 65, i32 71, i32 77, i32 83, i32 89, i32 95, i32 101, i32 107, i32 113, i32 119, i32 125, i32 131, i32 137, i32 143, i32 149, i32 155, i32 161, i32 167, i32 173, i32 179, i32 185, i32 191>
2281 store <32 x i8> %strided.vec0, ptr %out.vec0, align 64
2282 store <32 x i8> %strided.vec1, ptr %out.vec1, align 64
2283 store <32 x i8> %strided.vec2, ptr %out.vec2, align 64
2284 store <32 x i8> %strided.vec3, ptr %out.vec3, align 64
2285 store <32 x i8> %strided.vec4, ptr %out.vec4, align 64
2286 store <32 x i8> %strided.vec5, ptr %out.vec5, align 64
2287 ret void
2288 }
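; Note: each @load_i8_stride6_vfN body loads 6*N contiguous bytes and de-interleaves
; them into six N-element vectors; element i of %strided.veck is input byte 6*i + k
; (for vf32, %strided.vec2 gathers bytes 2, 8, 14, ..., 188), as spelled out by the
; shufflevector masks above.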
2290 define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind {
2291 ; SSE-LABEL: load_i8_stride6_vf64:
2292 ; SSE: # %bb.0:
2293 ; SSE-NEXT: subq $792, %rsp # imm = 0x318
2294 ; SSE-NEXT: movdqa 64(%rdi), %xmm4
2295 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2296 ; SSE-NEXT: movdqa 80(%rdi), %xmm5
2297 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2298 ; SSE-NEXT: movdqa (%rdi), %xmm7
2299 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2300 ; SSE-NEXT: movdqa 16(%rdi), %xmm6
2301 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2302 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
2303 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2304 ; SSE-NEXT: movdqa 48(%rdi), %xmm0
2305 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,0,65535,65535]
2306 ; SSE-NEXT: movdqa %xmm13, %xmm1
2307 ; SSE-NEXT: pandn %xmm2, %xmm1
2308 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,0,65535,65535,0]
2309 ; SSE-NEXT: movdqa %xmm3, %xmm2
2310 ; SSE-NEXT: pandn %xmm0, %xmm2
2311 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2312 ; SSE-NEXT: movdqa %xmm13, %xmm2
2313 ; SSE-NEXT: pandn %xmm0, %xmm2
2314 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2315 ; SSE-NEXT: pand %xmm13, %xmm0
2316 ; SSE-NEXT: por %xmm1, %xmm0
2317 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2318 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
2319 ; SSE-NEXT: pand %xmm1, %xmm0
2320 ; SSE-NEXT: movdqa %xmm1, %xmm10
2321 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
2322 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
2323 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
2324 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
2325 ; SSE-NEXT: packuswb %xmm1, %xmm0
2326 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535]
2327 ; SSE-NEXT: movdqa %xmm3, %xmm1
2328 ; SSE-NEXT: pandn %xmm6, %xmm1
2329 ; SSE-NEXT: movdqa %xmm7, %xmm2
2330 ; SSE-NEXT: pand %xmm3, %xmm2
2331 ; SSE-NEXT: por %xmm1, %xmm2
2332 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2333 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3]
2334 ; SSE-NEXT: pand %xmm10, %xmm1
2335 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
2336 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
2337 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
2338 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
2339 ; SSE-NEXT: packuswb %xmm1, %xmm1
2340 ; SSE-NEXT: pand %xmm8, %xmm1
2341 ; SSE-NEXT: movdqa %xmm8, %xmm2
2342 ; SSE-NEXT: pandn %xmm0, %xmm2
2343 ; SSE-NEXT: por %xmm2, %xmm1
2344 ; SSE-NEXT: movdqa %xmm13, %xmm0
2345 ; SSE-NEXT: pandn %xmm5, %xmm0
2346 ; SSE-NEXT: pand %xmm13, %xmm4
2347 ; SSE-NEXT: por %xmm0, %xmm4
2348 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2349 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,0]
2350 ; SSE-NEXT: pand %xmm10, %xmm0
2351 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
2352 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0]
2353 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
2354 ; SSE-NEXT: packuswb %xmm0, %xmm0
2355 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
2356 ; SSE-NEXT: movdqa %xmm4, %xmm2
2357 ; SSE-NEXT: pandn %xmm0, %xmm2
2358 ; SSE-NEXT: pand %xmm4, %xmm1
2359 ; SSE-NEXT: por %xmm1, %xmm2
2360 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2361 ; SSE-NEXT: movdqa 320(%rdi), %xmm1
2362 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2363 ; SSE-NEXT: movdqa %xmm13, %xmm0
2364 ; SSE-NEXT: pandn %xmm1, %xmm0
2365 ; SSE-NEXT: movdqa 336(%rdi), %xmm12
2366 ; SSE-NEXT: movdqa %xmm3, %xmm1
2367 ; SSE-NEXT: pandn %xmm12, %xmm1
2368 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2369 ; SSE-NEXT: movdqa %xmm13, %xmm1
2370 ; SSE-NEXT: pandn %xmm12, %xmm1
2371 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2372 ; SSE-NEXT: pand %xmm13, %xmm12
2373 ; SSE-NEXT: por %xmm0, %xmm12
2374 ; SSE-NEXT: movdqa %xmm12, %xmm0
2375 ; SSE-NEXT: pand %xmm10, %xmm0
2376 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7]
2377 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
2378 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
2379 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
2380 ; SSE-NEXT: packuswb %xmm1, %xmm0
2381 ; SSE-NEXT: movdqa %xmm8, %xmm1
2382 ; SSE-NEXT: pandn %xmm0, %xmm1
2383 ; SSE-NEXT: movdqa 304(%rdi), %xmm2
2384 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2385 ; SSE-NEXT: movdqa %xmm3, %xmm7
2386 ; SSE-NEXT: movdqa %xmm3, %xmm0
2387 ; SSE-NEXT: pandn %xmm2, %xmm0
2388 ; SSE-NEXT: movdqa 288(%rdi), %xmm6
2389 ; SSE-NEXT: movdqa %xmm6, %xmm2
2390 ; SSE-NEXT: pand %xmm3, %xmm2
2391 ; SSE-NEXT: por %xmm0, %xmm2
2392 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2393 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3]
2394 ; SSE-NEXT: pand %xmm10, %xmm0
2395 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2396 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2397 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
2398 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
2399 ; SSE-NEXT: packuswb %xmm0, %xmm0
2400 ; SSE-NEXT: pand %xmm8, %xmm0
2401 ; SSE-NEXT: por %xmm1, %xmm0
2402 ; SSE-NEXT: movdqa 368(%rdi), %xmm1
2403 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2404 ; SSE-NEXT: movdqa %xmm13, %xmm2
2405 ; SSE-NEXT: pandn %xmm1, %xmm2
2406 ; SSE-NEXT: movdqa 352(%rdi), %xmm3
2407 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2408 ; SSE-NEXT: pand %xmm13, %xmm3
2409 ; SSE-NEXT: por %xmm2, %xmm3
2410 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2411 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,0]
2412 ; SSE-NEXT: pand %xmm10, %xmm2
2413 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
2414 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,0]
2415 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
2416 ; SSE-NEXT: packuswb %xmm2, %xmm2
2417 ; SSE-NEXT: movdqa %xmm4, %xmm3
2418 ; SSE-NEXT: pandn %xmm2, %xmm3
2419 ; SSE-NEXT: pand %xmm4, %xmm0
2420 ; SSE-NEXT: movdqa %xmm4, %xmm9
2421 ; SSE-NEXT: por %xmm0, %xmm3
2422 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2423 ; SSE-NEXT: movdqa 224(%rdi), %xmm1
2424 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2425 ; SSE-NEXT: movdqa %xmm13, %xmm0
2426 ; SSE-NEXT: pandn %xmm1, %xmm0
2427 ; SSE-NEXT: movdqa 240(%rdi), %xmm11
2428 ; SSE-NEXT: movdqa %xmm7, %xmm2
2429 ; SSE-NEXT: pandn %xmm11, %xmm2
2430 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2431 ; SSE-NEXT: movdqa %xmm13, %xmm2
2432 ; SSE-NEXT: pandn %xmm11, %xmm2
2433 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2434 ; SSE-NEXT: pand %xmm13, %xmm11
2435 ; SSE-NEXT: por %xmm0, %xmm11
2436 ; SSE-NEXT: movdqa %xmm11, %xmm0
2437 ; SSE-NEXT: pand %xmm10, %xmm0
2438 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7]
2439 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
2440 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
2441 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
2442 ; SSE-NEXT: packuswb %xmm2, %xmm0
2443 ; SSE-NEXT: movdqa %xmm8, %xmm2
2444 ; SSE-NEXT: pandn %xmm0, %xmm2
2445 ; SSE-NEXT: movdqa 208(%rdi), %xmm1
2446 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2447 ; SSE-NEXT: movdqa %xmm7, %xmm0
2448 ; SSE-NEXT: pandn %xmm1, %xmm0
2449 ; SSE-NEXT: movdqa 192(%rdi), %xmm3
2450 ; SSE-NEXT: movdqa %xmm3, %xmm1
2451 ; SSE-NEXT: pand %xmm7, %xmm1
2452 ; SSE-NEXT: por %xmm0, %xmm1
2453 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2454 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,1,3]
2455 ; SSE-NEXT: movdqa %xmm10, %xmm1
2456 ; SSE-NEXT: pand %xmm10, %xmm0
2457 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2458 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
2459 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
2460 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
2461 ; SSE-NEXT: packuswb %xmm0, %xmm0
2462 ; SSE-NEXT: pand %xmm8, %xmm0
2463 ; SSE-NEXT: movdqa %xmm8, %xmm10
2464 ; SSE-NEXT: por %xmm2, %xmm0
2465 ; SSE-NEXT: movdqa 272(%rdi), %xmm14
2466 ; SSE-NEXT: movdqa %xmm13, %xmm2
2467 ; SSE-NEXT: pandn %xmm14, %xmm2
2468 ; SSE-NEXT: movdqa 256(%rdi), %xmm15
2469 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2470 ; SSE-NEXT: pand %xmm13, %xmm15
2471 ; SSE-NEXT: por %xmm2, %xmm15
2472 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[3,1,2,0]
2473 ; SSE-NEXT: pand %xmm1, %xmm2
2474 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
2475 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,0]
2476 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
2477 ; SSE-NEXT: packuswb %xmm2, %xmm2
2478 ; SSE-NEXT: pandn %xmm2, %xmm4
2479 ; SSE-NEXT: pand %xmm9, %xmm0
2480 ; SSE-NEXT: por %xmm0, %xmm4
2481 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2482 ; SSE-NEXT: movdqa 128(%rdi), %xmm2
2483 ; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
2484 ; SSE-NEXT: movdqa %xmm13, %xmm0
2485 ; SSE-NEXT: pandn %xmm2, %xmm0
2486 ; SSE-NEXT: movdqa 144(%rdi), %xmm9
2487 ; SSE-NEXT: movdqa %xmm7, %xmm4
2488 ; SSE-NEXT: pandn %xmm9, %xmm4
2489 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2490 ; SSE-NEXT: movdqa %xmm13, %xmm4
2491 ; SSE-NEXT: pandn %xmm9, %xmm4
2492 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2493 ; SSE-NEXT: pand %xmm13, %xmm9
2494 ; SSE-NEXT: por %xmm0, %xmm9
2495 ; SSE-NEXT: movdqa %xmm9, %xmm0
2496 ; SSE-NEXT: pand %xmm1, %xmm0
2497 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,3,2,3,4,5,6,7]
2498 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
2499 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
2500 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
2501 ; SSE-NEXT: packuswb %xmm5, %xmm0
2502 ; SSE-NEXT: pandn %xmm0, %xmm10
2503 ; SSE-NEXT: movdqa %xmm13, %xmm0
2504 ; SSE-NEXT: movdqa %xmm13, %xmm2
2505 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
2506 ; SSE-NEXT: pandn %xmm13, %xmm2
2507 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2508 ; SSE-NEXT: movdqa %xmm13, %xmm1
2509 ; SSE-NEXT: movdqa %xmm0, %xmm2
2510 ; SSE-NEXT: pandn %xmm6, %xmm2
2511 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2512 ; SSE-NEXT: movdqa %xmm6, %xmm5
2513 ; SSE-NEXT: movdqa %xmm0, %xmm2
2514 ; SSE-NEXT: pandn %xmm3, %xmm2
2515 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2516 ; SSE-NEXT: movdqa %xmm3, %xmm4
2517 ; SSE-NEXT: movdqa 112(%rdi), %xmm6
2518 ; SSE-NEXT: movdqa %xmm7, %xmm2
2519 ; SSE-NEXT: movdqa %xmm7, %xmm8
2520 ; SSE-NEXT: pandn %xmm6, %xmm8
2521 ; SSE-NEXT: movdqa 160(%rdi), %xmm7
2522 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2523 ; SSE-NEXT: pand %xmm0, %xmm7
2524 ; SSE-NEXT: movdqa %xmm0, %xmm3
2525 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
2526 ; SSE-NEXT: pandn %xmm13, %xmm3
2527 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2528 ; SSE-NEXT: pand %xmm0, %xmm1
2529 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2530 ; SSE-NEXT: movdqa %xmm2, %xmm3
2531 ; SSE-NEXT: movdqa %xmm2, %xmm1
2532 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2533 ; SSE-NEXT: pandn %xmm2, %xmm3
2534 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2535 ; SSE-NEXT: pand %xmm0, %xmm2
2536 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2537 ; SSE-NEXT: movdqa %xmm0, %xmm2
2538 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2539 ; SSE-NEXT: pandn %xmm3, %xmm2
2540 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2541 ; SSE-NEXT: pand %xmm0, %xmm5
2542 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2543 ; SSE-NEXT: movdqa %xmm1, %xmm5
2544 ; SSE-NEXT: movdqa %xmm1, %xmm2
2545 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2546 ; SSE-NEXT: pandn %xmm1, %xmm2
2547 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2548 ; SSE-NEXT: pand %xmm0, %xmm1
2549 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2550 ; SSE-NEXT: movdqa %xmm0, %xmm2
2551 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2552 ; SSE-NEXT: pandn %xmm1, %xmm2
2553 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2554 ; SSE-NEXT: pand %xmm0, %xmm4
2555 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2556 ; SSE-NEXT: pandn %xmm14, %xmm5
2557 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2558 ; SSE-NEXT: pand %xmm0, %xmm14
2559 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2560 ; SSE-NEXT: movdqa %xmm0, %xmm2
2561 ; SSE-NEXT: pandn %xmm6, %xmm2
2562 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2563 ; SSE-NEXT: movdqa 96(%rdi), %xmm4
2564 ; SSE-NEXT: movdqa %xmm4, %xmm2
2565 ; SSE-NEXT: pand %xmm0, %xmm2
2566 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2567 ; SSE-NEXT: movdqa 176(%rdi), %xmm14
2568 ; SSE-NEXT: movdqa %xmm14, %xmm2
2569 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2570 ; SSE-NEXT: pand %xmm0, %xmm2
2571 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2572 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2573 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2574 ; SSE-NEXT: pand %xmm0, %xmm2
2575 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2576 ; SSE-NEXT: pand %xmm0, %xmm13
2577 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2578 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2579 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2580 ; SSE-NEXT: pand %xmm0, %xmm2
2581 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2582 ; SSE-NEXT: pand %xmm0, %xmm3
2583 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2584 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2585 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2586 ; SSE-NEXT: pand %xmm0, %xmm2
2587 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2588 ; SSE-NEXT: pand %xmm0, %xmm1
2589 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2590 ; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
2591 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2592 ; SSE-NEXT: pand %xmm0, %xmm1
2593 ; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
2594 ; SSE-NEXT: movdqa %xmm0, %xmm1
2595 ; SSE-NEXT: pand %xmm0, %xmm6
2596 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2597 ; SSE-NEXT: movdqa %xmm0, %xmm13
2598 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2599 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2600 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2601 ; SSE-NEXT: pandn %xmm4, %xmm1
2602 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2603 ; SSE-NEXT: movdqa %xmm4, %xmm3
2604 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2605 ; SSE-NEXT: por %xmm8, %xmm3
2606 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,2,1,3]
2607 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
2608 ; SSE-NEXT: pand %xmm1, %xmm5
2609 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
2610 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3]
2611 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7]
2612 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
2613 ; SSE-NEXT: packuswb %xmm5, %xmm5
2614 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535]
2615 ; SSE-NEXT: pand %xmm8, %xmm5
2616 ; SSE-NEXT: por %xmm10, %xmm5
2617 ; SSE-NEXT: pandn %xmm14, %xmm0
2618 ; SSE-NEXT: por %xmm0, %xmm7
2619 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,1,2,0]
2620 ; SSE-NEXT: pand %xmm1, %xmm0
2621 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
2622 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0]
2623 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
2624 ; SSE-NEXT: packuswb %xmm0, %xmm0
2625 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
2626 ; SSE-NEXT: movdqa %xmm10, %xmm1
2627 ; SSE-NEXT: pandn %xmm0, %xmm1
2628 ; SSE-NEXT: pand %xmm10, %xmm5
2629 ; SSE-NEXT: por %xmm5, %xmm1
2630 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2631 ; SSE-NEXT: pxor %xmm5, %xmm5
2632 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2633 ; SSE-NEXT: movdqa %xmm1, %xmm0
2634 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
2635 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
2636 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,3,3]
2637 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
2638 ; SSE-NEXT: psrld $16, %xmm0
2639 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,1,0,3]
2640 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,7,6,7]
2641 ; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3]
2642 ; SSE-NEXT: packuswb %xmm14, %xmm4
2643 ; SSE-NEXT: movdqa %xmm8, %xmm1
2644 ; SSE-NEXT: pandn %xmm4, %xmm1
2645 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2646 ; SSE-NEXT: movdqa %xmm2, %xmm4
2647 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
2648 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3]
2649 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
2650 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
2651 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,0,0,65535,65535]
2652 ; SSE-NEXT: movdqa %xmm0, %xmm14
2653 ; SSE-NEXT: pandn %xmm4, %xmm14
2654 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
2655 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[3,1,2,3,4,5,6,7]
2656 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
2657 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7]
2658 ; SSE-NEXT: pand %xmm0, %xmm4
2659 ; SSE-NEXT: por %xmm14, %xmm4
2660 ; SSE-NEXT: packuswb %xmm4, %xmm4
2661 ; SSE-NEXT: pand %xmm8, %xmm4
2662 ; SSE-NEXT: por %xmm1, %xmm4
2663 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2664 ; SSE-NEXT: movdqa %xmm6, %xmm1
2665 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
2666 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,5,5,5,5]
2667 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535]
2668 ; SSE-NEXT: movdqa %xmm2, %xmm1
2669 ; SSE-NEXT: pandn %xmm14, %xmm1
2670 ; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
2671 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm6[3,1,2,3,4,5,6,7]
2672 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,0,3]
2673 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,7,6,4]
2674 ; SSE-NEXT: pand %xmm2, %xmm14
2675 ; SSE-NEXT: por %xmm1, %xmm14
2676 ; SSE-NEXT: packuswb %xmm14, %xmm1
2677 ; SSE-NEXT: movdqa %xmm10, %xmm14
2678 ; SSE-NEXT: pandn %xmm1, %xmm14
2679 ; SSE-NEXT: pand %xmm10, %xmm4
2680 ; SSE-NEXT: por %xmm4, %xmm14
2681 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2682 ; SSE-NEXT: movdqa %xmm12, %xmm1
2683 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
2684 ; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
2685 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3]
2686 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
2687 ; SSE-NEXT: psrld $16, %xmm1
2688 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,0,3]
2689 ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,7,6,7]
2690 ; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3]
2691 ; SSE-NEXT: packuswb %xmm12, %xmm4
2692 ; SSE-NEXT: movdqa %xmm8, %xmm14
2693 ; SSE-NEXT: movdqa %xmm8, %xmm1
2694 ; SSE-NEXT: pandn %xmm4, %xmm1
2695 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2696 ; SSE-NEXT: movdqa %xmm6, %xmm4
2697 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
2698 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3]
2699 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
2700 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
2701 ; SSE-NEXT: movdqa %xmm0, %xmm12
2702 ; SSE-NEXT: pandn %xmm4, %xmm12
2703 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
2704 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7]
2705 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
2706 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7]
2707 ; SSE-NEXT: pand %xmm0, %xmm4
2708 ; SSE-NEXT: por %xmm12, %xmm4
2709 ; SSE-NEXT: packuswb %xmm4, %xmm4
2710 ; SSE-NEXT: pand %xmm8, %xmm4
2711 ; SSE-NEXT: por %xmm1, %xmm4
2712 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2713 ; SSE-NEXT: movdqa %xmm6, %xmm1
2714 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
2715 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
2716 ; SSE-NEXT: movdqa %xmm2, %xmm12
2717 ; SSE-NEXT: pandn %xmm1, %xmm12
2718 ; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
2719 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,1,2,3,4,5,6,7]
2720 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
2721 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4]
2722 ; SSE-NEXT: pand %xmm2, %xmm1
2723 ; SSE-NEXT: por %xmm12, %xmm1
2724 ; SSE-NEXT: packuswb %xmm1, %xmm1
2725 ; SSE-NEXT: movdqa %xmm10, %xmm12
2726 ; SSE-NEXT: pandn %xmm1, %xmm12
2727 ; SSE-NEXT: pand %xmm10, %xmm4
2728 ; SSE-NEXT: por %xmm4, %xmm12
2729 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2730 ; SSE-NEXT: movdqa %xmm11, %xmm1
2731 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
2732 ; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7]
2733 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,2,3,3]
2734 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
2735 ; SSE-NEXT: psrld $16, %xmm1
2736 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[0,1,0,3]
2737 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7]
2738 ; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3]
2739 ; SSE-NEXT: packuswb %xmm8, %xmm4
2740 ; SSE-NEXT: movdqa %xmm14, %xmm1
2741 ; SSE-NEXT: pandn %xmm4, %xmm1
2742 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2743 ; SSE-NEXT: movdqa %xmm6, %xmm4
2744 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
2745 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3]
2746 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
2747 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
2748 ; SSE-NEXT: movdqa %xmm0, %xmm8
2749 ; SSE-NEXT: pandn %xmm4, %xmm8
2750 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
2751 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7]
2752 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
2753 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7]
2754 ; SSE-NEXT: pand %xmm0, %xmm4
2755 ; SSE-NEXT: por %xmm8, %xmm4
2756 ; SSE-NEXT: packuswb %xmm4, %xmm4
2757 ; SSE-NEXT: pand %xmm14, %xmm4
2758 ; SSE-NEXT: por %xmm1, %xmm4
2759 ; SSE-NEXT: movdqa %xmm15, %xmm1
2760 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
2761 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
2762 ; SSE-NEXT: movdqa %xmm2, %xmm8
2763 ; SSE-NEXT: pandn %xmm1, %xmm8
2764 ; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15]
2765 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[3,1,2,3,4,5,6,7]
2766 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
2767 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4]
2768 ; SSE-NEXT: pand %xmm2, %xmm1
2769 ; SSE-NEXT: por %xmm8, %xmm1
2770 ; SSE-NEXT: packuswb %xmm1, %xmm1
2771 ; SSE-NEXT: movdqa %xmm10, %xmm8
2772 ; SSE-NEXT: pandn %xmm1, %xmm8
2773 ; SSE-NEXT: pand %xmm10, %xmm4
2774 ; SSE-NEXT: por %xmm4, %xmm8
2775 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2776 ; SSE-NEXT: movdqa %xmm9, %xmm1
2777 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
2778 ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7]
2779 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[2,2,3,3]
2780 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
2781 ; SSE-NEXT: psrld $16, %xmm1
2782 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,1,0,3]
2783 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7]
2784 ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
2785 ; SSE-NEXT: packuswb %xmm6, %xmm4
2786 ; SSE-NEXT: movdqa %xmm3, %xmm1
2787 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
2788 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
2789 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
2790 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7]
2791 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
2792 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
2793 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
2794 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,3,2,0,4,5,6,7]
2795 ; SSE-NEXT: pand %xmm0, %xmm3
2796 ; SSE-NEXT: pandn %xmm1, %xmm0
2797 ; SSE-NEXT: por %xmm3, %xmm0
2798 ; SSE-NEXT: packuswb %xmm0, %xmm0
2799 ; SSE-NEXT: movdqa %xmm14, %xmm1
2800 ; SSE-NEXT: pand %xmm14, %xmm0
2801 ; SSE-NEXT: pandn %xmm4, %xmm1
2802 ; SSE-NEXT: por %xmm1, %xmm0
2803 ; SSE-NEXT: movdqa %xmm7, %xmm1
2804 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
2805 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
2806 ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
2807 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7]
2808 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
2809 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,4]
2810 ; SSE-NEXT: pand %xmm2, %xmm3
2811 ; SSE-NEXT: pandn %xmm1, %xmm2
2812 ; SSE-NEXT: por %xmm3, %xmm2
2813 ; SSE-NEXT: packuswb %xmm2, %xmm1
2814 ; SSE-NEXT: movdqa %xmm10, %xmm2
2815 ; SSE-NEXT: pandn %xmm1, %xmm2
2816 ; SSE-NEXT: pand %xmm10, %xmm0
2817 ; SSE-NEXT: movdqa %xmm10, %xmm11
2818 ; SSE-NEXT: por %xmm0, %xmm2
2819 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2820 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,0]
2821 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2822 ; SSE-NEXT: pand %xmm14, %xmm4
2823 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
2824 ; SSE-NEXT: movdqa %xmm4, %xmm0
2825 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255]
2826 ; SSE-NEXT: pand %xmm10, %xmm0
2827 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7]
2828 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2829 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7]
2830 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
2831 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
2832 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6]
2833 ; SSE-NEXT: packuswb %xmm1, %xmm0
2834 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2835 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
2836 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[2,1,2,3,4,5,6,7]
2837 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
2838 ; SSE-NEXT: pand %xmm10, %xmm1
2839 ; SSE-NEXT: movdqa %xmm10, %xmm12
2840 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
2841 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
2842 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,5,5,5,5]
2843 ; SSE-NEXT: packuswb %xmm2, %xmm2
2844 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
2845 ; SSE-NEXT: movdqa %xmm5, %xmm3
2846 ; SSE-NEXT: pandn %xmm2, %xmm3
2847 ; SSE-NEXT: pand %xmm5, %xmm0
2848 ; SSE-NEXT: por %xmm0, %xmm3
2849 ; SSE-NEXT: movdqa %xmm13, %xmm0
2850 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2851 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2852 ; SSE-NEXT: por %xmm0, %xmm1
2853 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2854 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7]
2855 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2856 ; SSE-NEXT: pand %xmm10, %xmm0
2857 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
2858 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
2859 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
2860 ; SSE-NEXT: packuswb %xmm0, %xmm0
2861 ; SSE-NEXT: movdqa %xmm11, %xmm2
2862 ; SSE-NEXT: pandn %xmm0, %xmm2
2863 ; SSE-NEXT: pand %xmm11, %xmm3
2864 ; SSE-NEXT: movdqa %xmm11, %xmm8
2865 ; SSE-NEXT: por %xmm3, %xmm2
2866 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2867 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
2868 ; SSE-NEXT: pand %xmm14, %xmm13
2869 ; SSE-NEXT: movdqa %xmm14, %xmm7
2870 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
2871 ; SSE-NEXT: movdqa %xmm13, %xmm0
2872 ; SSE-NEXT: pand %xmm10, %xmm0
2873 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,7,6,7]
2874 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2875 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7]
2876 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
2877 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
2878 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6]
2879 ; SSE-NEXT: packuswb %xmm2, %xmm0
2880 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2881 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
2882 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[2,1,2,3,4,5,6,7]
2883 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
2884 ; SSE-NEXT: pand %xmm10, %xmm2
2885 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
2886 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
2887 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
2888 ; SSE-NEXT: packuswb %xmm2, %xmm2
2889 ; SSE-NEXT: movdqa %xmm5, %xmm3
2890 ; SSE-NEXT: pandn %xmm2, %xmm3
2891 ; SSE-NEXT: pand %xmm5, %xmm0
2892 ; SSE-NEXT: por %xmm0, %xmm3
2893 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2894 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2895 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
2896 ; SSE-NEXT: por %xmm0, %xmm11
2897 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,3,2,3,4,5,6,7]
2898 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2899 ; SSE-NEXT: pand %xmm10, %xmm0
2900 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
2901 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
2902 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
2903 ; SSE-NEXT: packuswb %xmm0, %xmm0
2904 ; SSE-NEXT: movdqa %xmm8, %xmm2
2905 ; SSE-NEXT: pandn %xmm0, %xmm2
2906 ; SSE-NEXT: pand %xmm8, %xmm3
2907 ; SSE-NEXT: movdqa %xmm8, %xmm9
2908 ; SSE-NEXT: por %xmm3, %xmm2
2909 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2910 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2911 ; SSE-NEXT: pand %xmm7, %xmm10
2912 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
2913 ; SSE-NEXT: movdqa %xmm10, %xmm0
2914 ; SSE-NEXT: pand %xmm12, %xmm0
2915 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,7,6,7]
2916 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2917 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7]
2918 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
2919 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
2920 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6]
2921 ; SSE-NEXT: packuswb %xmm2, %xmm0
2922 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2923 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2924 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2925 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,1,2,3,4,5,6,7]
2926 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
2927 ; SSE-NEXT: pand %xmm12, %xmm2
2928 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
2929 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
2930 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
2931 ; SSE-NEXT: packuswb %xmm2, %xmm2
2932 ; SSE-NEXT: movdqa %xmm5, %xmm3
2933 ; SSE-NEXT: pandn %xmm2, %xmm3
2934 ; SSE-NEXT: pand %xmm5, %xmm0
2935 ; SSE-NEXT: por %xmm0, %xmm3
2936 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2937 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2938 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2939 ; SSE-NEXT: por %xmm0, %xmm8
2940 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,3,2,3,4,5,6,7]
2941 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2942 ; SSE-NEXT: pand %xmm12, %xmm0
2943 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
2944 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
2945 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
2946 ; SSE-NEXT: packuswb %xmm0, %xmm0
2947 ; SSE-NEXT: movdqa %xmm9, %xmm2
2948 ; SSE-NEXT: pandn %xmm0, %xmm2
2949 ; SSE-NEXT: pand %xmm9, %xmm3
2950 ; SSE-NEXT: movdqa %xmm9, %xmm1
2951 ; SSE-NEXT: por %xmm3, %xmm2
2952 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2953 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2954 ; SSE-NEXT: pand %xmm7, %xmm0
2955 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2956 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2957 ; SSE-NEXT: pand %xmm12, %xmm0
2958 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,7,6,7]
2959 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
2960 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7]
2961 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
2962 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
2963 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6]
2964 ; SSE-NEXT: packuswb %xmm2, %xmm0
2965 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
2966 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
2967 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[2,1,2,3,4,5,6,7]
2968 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
2969 ; SSE-NEXT: pand %xmm12, %xmm2
2970 ; SSE-NEXT: movdqa %xmm12, %xmm9
2971 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
2972 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
2973 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
2974 ; SSE-NEXT: packuswb %xmm2, %xmm2
2975 ; SSE-NEXT: movdqa %xmm5, %xmm3
2976 ; SSE-NEXT: pandn %xmm2, %xmm3
2977 ; SSE-NEXT: pand %xmm5, %xmm0
2978 ; SSE-NEXT: por %xmm0, %xmm3
2979 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2980 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2981 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
2982 ; SSE-NEXT: por %xmm0, %xmm12
2983 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,3,2,3,4,5,6,7]
2984 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2985 ; SSE-NEXT: pand %xmm9, %xmm0
2986 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
2987 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
2988 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
2989 ; SSE-NEXT: packuswb %xmm0, %xmm0
2990 ; SSE-NEXT: movdqa %xmm1, %xmm9
2991 ; SSE-NEXT: movdqa %xmm1, %xmm2
2992 ; SSE-NEXT: pandn %xmm0, %xmm2
2993 ; SSE-NEXT: pand %xmm1, %xmm3
2994 ; SSE-NEXT: por %xmm3, %xmm2
2995 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2996 ; SSE-NEXT: movdqa %xmm4, %xmm0
2997 ; SSE-NEXT: pxor %xmm1, %xmm1
2998 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2999 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
3000 ; SSE-NEXT: pxor %xmm7, %xmm7
3001 ; SSE-NEXT: movdqa %xmm4, %xmm2
3002 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0]
3003 ; SSE-NEXT: movaps %xmm0, %xmm3
3004 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2]
3005 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0]
3006 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2,3]
3007 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7]
3008 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
3009 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
3010 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
3011 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
3012 ; SSE-NEXT: packuswb %xmm0, %xmm2
3013 ; SSE-NEXT: movdqa %xmm6, %xmm0
3014 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
3015 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
3016 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
3017 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,0,65535,65535,65535,65535]
3018 ; SSE-NEXT: movdqa %xmm3, %xmm4
3019 ; SSE-NEXT: pandn %xmm0, %xmm4
3020 ; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
3021 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,3,2,1]
3022 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
3023 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
3024 ; SSE-NEXT: pand %xmm3, %xmm0
3025 ; SSE-NEXT: por %xmm4, %xmm0
3026 ; SSE-NEXT: packuswb %xmm0, %xmm0
3027 ; SSE-NEXT: movdqa %xmm5, %xmm6
3028 ; SSE-NEXT: pandn %xmm0, %xmm6
3029 ; SSE-NEXT: pand %xmm5, %xmm2
3030 ; SSE-NEXT: por %xmm2, %xmm6
3031 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3032 ; SSE-NEXT: movdqa %xmm1, %xmm0
3033 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15]
3034 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
3035 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5]
3036 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,65535,65535,0]
3037 ; SSE-NEXT: movdqa %xmm4, %xmm2
3038 ; SSE-NEXT: pandn %xmm0, %xmm2
3039 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
3040 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,0,3]
3041 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
3042 ; SSE-NEXT: pand %xmm4, %xmm0
3043 ; SSE-NEXT: por %xmm2, %xmm0
3044 ; SSE-NEXT: packuswb %xmm0, %xmm0
3045 ; SSE-NEXT: movdqa %xmm9, %xmm2
3046 ; SSE-NEXT: pandn %xmm0, %xmm2
3047 ; SSE-NEXT: pand %xmm9, %xmm6
3048 ; SSE-NEXT: por %xmm6, %xmm2
3049 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3050 ; SSE-NEXT: movdqa %xmm13, %xmm0
3051 ; SSE-NEXT: pxor %xmm1, %xmm1
3052 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3053 ; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm1[8],xmm13[9],xmm1[9],xmm13[10],xmm1[10],xmm13[11],xmm1[11],xmm13[12],xmm1[12],xmm13[13],xmm1[13],xmm13[14],xmm1[14],xmm13[15],xmm1[15]
3054 ; SSE-NEXT: movdqa %xmm13, %xmm2
3055 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0]
3056 ; SSE-NEXT: movaps %xmm0, %xmm6
3057 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm2[0,2]
3058 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0]
3059 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,3]
3060 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,7,5,6,7]
3061 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
3062 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
3063 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
3064 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
3065 ; SSE-NEXT: packuswb %xmm0, %xmm2
3066 ; SSE-NEXT: movdqa %xmm14, %xmm0
3067 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3068 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
3069 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
3070 ; SSE-NEXT: movdqa %xmm3, %xmm6
3071 ; SSE-NEXT: pandn %xmm0, %xmm6
3072 ; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm1[8],xmm14[9],xmm1[9],xmm14[10],xmm1[10],xmm14[11],xmm1[11],xmm14[12],xmm1[12],xmm14[13],xmm1[13],xmm14[14],xmm1[14],xmm14[15],xmm1[15]
3073 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,3,2,1]
3074 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
3075 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
3076 ; SSE-NEXT: pand %xmm3, %xmm0
3077 ; SSE-NEXT: por %xmm6, %xmm0
3078 ; SSE-NEXT: packuswb %xmm0, %xmm0
3079 ; SSE-NEXT: movdqa %xmm5, %xmm6
3080 ; SSE-NEXT: pandn %xmm0, %xmm6
3081 ; SSE-NEXT: pand %xmm5, %xmm2
3082 ; SSE-NEXT: por %xmm2, %xmm6
3083 ; SSE-NEXT: movdqa %xmm11, %xmm0
3084 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
3085 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
3086 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5]
3087 ; SSE-NEXT: movdqa %xmm4, %xmm2
3088 ; SSE-NEXT: pandn %xmm0, %xmm2
3089 ; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3],xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7]
3090 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,2,0,3]
3091 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
3092 ; SSE-NEXT: pand %xmm4, %xmm0
3093 ; SSE-NEXT: por %xmm2, %xmm0
3094 ; SSE-NEXT: packuswb %xmm0, %xmm0
3095 ; SSE-NEXT: movdqa %xmm9, %xmm2
3096 ; SSE-NEXT: pandn %xmm0, %xmm2
3097 ; SSE-NEXT: pand %xmm9, %xmm6
3098 ; SSE-NEXT: por %xmm6, %xmm2
3099 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3100 ; SSE-NEXT: movdqa %xmm10, %xmm0
3101 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3102 ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15]
3103 ; SSE-NEXT: movdqa %xmm10, %xmm2
3104 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0]
3105 ; SSE-NEXT: movaps %xmm0, %xmm6
3106 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm2[0,2]
3107 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0]
3108 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[2,3]
3109 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,7,5,6,7]
3110 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
3111 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
3112 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
3113 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
3114 ; SSE-NEXT: packuswb %xmm0, %xmm2
3115 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3116 ; SSE-NEXT: movdqa %xmm7, %xmm0
3117 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3118 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
3119 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
3120 ; SSE-NEXT: movdqa %xmm3, %xmm6
3121 ; SSE-NEXT: pandn %xmm0, %xmm6
3122 ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15]
3123 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,3,2,1]
3124 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
3125 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
3126 ; SSE-NEXT: pand %xmm3, %xmm0
3127 ; SSE-NEXT: por %xmm6, %xmm0
3128 ; SSE-NEXT: packuswb %xmm0, %xmm0
3129 ; SSE-NEXT: movdqa %xmm5, %xmm6
3130 ; SSE-NEXT: pandn %xmm0, %xmm6
3131 ; SSE-NEXT: pand %xmm5, %xmm2
3132 ; SSE-NEXT: por %xmm2, %xmm6
3133 ; SSE-NEXT: movdqa %xmm8, %xmm0
3134 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
3135 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
3136 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5]
3137 ; SSE-NEXT: movdqa %xmm4, %xmm2
3138 ; SSE-NEXT: pandn %xmm0, %xmm2
3139 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
3140 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,2,0,3]
3141 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
3142 ; SSE-NEXT: pand %xmm4, %xmm0
3143 ; SSE-NEXT: por %xmm2, %xmm0
3144 ; SSE-NEXT: packuswb %xmm0, %xmm2
3145 ; SSE-NEXT: movdqa %xmm9, %xmm0
3146 ; SSE-NEXT: pandn %xmm2, %xmm0
3147 ; SSE-NEXT: pand %xmm9, %xmm6
3148 ; SSE-NEXT: por %xmm6, %xmm0
3149 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3150 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3151 ; SSE-NEXT: movdqa %xmm0, %xmm2
3152 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
3153 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
3154 ; SSE-NEXT: movdqa %xmm0, %xmm6
3155 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[3,0]
3156 ; SSE-NEXT: movaps %xmm2, %xmm7
3157 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm6[0,2]
3158 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
3159 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
3160 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,7,5,6,7]
3161 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2]
3162 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
3163 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
3164 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
3165 ; SSE-NEXT: packuswb %xmm2, %xmm6
3166 ; SSE-NEXT: movdqa %xmm15, %xmm2
3167 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
3168 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
3169 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7]
3170 ; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15]
3171 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[0,3,2,1]
3172 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7]
3173 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7]
3174 ; SSE-NEXT: pand %xmm3, %xmm7
3175 ; SSE-NEXT: pandn %xmm2, %xmm3
3176 ; SSE-NEXT: por %xmm7, %xmm3
3177 ; SSE-NEXT: pand %xmm5, %xmm6
3178 ; SSE-NEXT: packuswb %xmm3, %xmm3
3179 ; SSE-NEXT: pandn %xmm3, %xmm5
3180 ; SSE-NEXT: por %xmm6, %xmm5
3181 ; SSE-NEXT: movdqa %xmm12, %xmm2
3182 ; SSE-NEXT: pxor %xmm0, %xmm0
3183 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
3184 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
3185 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
3186 ; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
3187 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,2,0,3]
3188 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
3189 ; SSE-NEXT: pand %xmm4, %xmm3
3190 ; SSE-NEXT: pandn %xmm2, %xmm4
3191 ; SSE-NEXT: por %xmm3, %xmm4
3192 ; SSE-NEXT: pand %xmm9, %xmm5
3193 ; SSE-NEXT: packuswb %xmm4, %xmm2
3194 ; SSE-NEXT: pandn %xmm2, %xmm9
3195 ; SSE-NEXT: por %xmm5, %xmm9
3196 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3197 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3198 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3199 ; SSE-NEXT: movdqa %xmm0, %xmm1
3200 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255]
3201 ; SSE-NEXT: pand %xmm10, %xmm1
3202 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,2,3]
3203 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
3204 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
3205 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
3206 ; SSE-NEXT: packuswb %xmm2, %xmm1
3207 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255]
3208 ; SSE-NEXT: movdqa %xmm15, %xmm2
3209 ; SSE-NEXT: pandn %xmm1, %xmm2
3210 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3211 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
3212 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,0]
3213 ; SSE-NEXT: pand %xmm10, %xmm1
3214 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
3215 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
3216 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[2,1,0,3,4,5,6,7]
3217 ; SSE-NEXT: packuswb %xmm6, %xmm6
3218 ; SSE-NEXT: pand %xmm15, %xmm6
3219 ; SSE-NEXT: por %xmm2, %xmm6
3220 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3221 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,0]
3222 ; SSE-NEXT: pand %xmm11, %xmm13
3223 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
3224 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,1,3]
3225 ; SSE-NEXT: pand %xmm10, %xmm1
3226 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
3227 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
3228 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
3229 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
3230 ; SSE-NEXT: packuswb %xmm1, %xmm2
3231 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,0]
3232 ; SSE-NEXT: movdqa %xmm1, %xmm3
3233 ; SSE-NEXT: pandn %xmm2, %xmm3
3234 ; SSE-NEXT: pand %xmm1, %xmm6
3235 ; SSE-NEXT: por %xmm6, %xmm3
3236 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3237 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3238 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
3239 ; SSE-NEXT: movdqa %xmm14, %xmm2
3240 ; SSE-NEXT: pand %xmm10, %xmm2
3241 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,1,2,3]
3242 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7]
3243 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
3244 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
3245 ; SSE-NEXT: packuswb %xmm6, %xmm2
3246 ; SSE-NEXT: movdqa %xmm15, %xmm6
3247 ; SSE-NEXT: pandn %xmm2, %xmm6
3248 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3249 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3250 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,1,2,0]
3251 ; SSE-NEXT: pand %xmm10, %xmm2
3252 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
3253 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
3254 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[2,1,0,3,4,5,6,7]
3255 ; SSE-NEXT: packuswb %xmm7, %xmm7
3256 ; SSE-NEXT: pand %xmm15, %xmm7
3257 ; SSE-NEXT: por %xmm6, %xmm7
3258 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3259 ; SSE-NEXT: pand %xmm11, %xmm3
3260 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3261 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3]
3262 ; SSE-NEXT: pand %xmm10, %xmm2
3263 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
3264 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
3265 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
3266 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
3267 ; SSE-NEXT: packuswb %xmm2, %xmm6
3268 ; SSE-NEXT: movdqa %xmm1, %xmm2
3269 ; SSE-NEXT: pandn %xmm6, %xmm2
3270 ; SSE-NEXT: pand %xmm1, %xmm7
3271 ; SSE-NEXT: por %xmm7, %xmm2
3272 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3273 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3274 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
3275 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3276 ; SSE-NEXT: pand %xmm10, %xmm6
3277 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
3278 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7]
3279 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7]
3280 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2]
3281 ; SSE-NEXT: packuswb %xmm7, %xmm6
3282 ; SSE-NEXT: movdqa %xmm15, %xmm7
3283 ; SSE-NEXT: pandn %xmm6, %xmm7
3284 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3285 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3286 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3287 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,1,2,0]
3288 ; SSE-NEXT: pand %xmm10, %xmm6
3289 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7]
3290 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,0,3]
3291 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[2,1,0,3,4,5,6,7]
3292 ; SSE-NEXT: packuswb %xmm8, %xmm8
3293 ; SSE-NEXT: pand %xmm15, %xmm8
3294 ; SSE-NEXT: por %xmm7, %xmm8
3295 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3296 ; SSE-NEXT: pand %xmm11, %xmm2
3297 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3298 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3299 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,2,1,3]
3300 ; SSE-NEXT: pand %xmm10, %xmm6
3301 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,2,1,4,5,6,7]
3302 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3]
3303 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
3304 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,4,7]
3305 ; SSE-NEXT: packuswb %xmm6, %xmm7
3306 ; SSE-NEXT: movdqa %xmm1, %xmm2
3307 ; SSE-NEXT: pandn %xmm7, %xmm2
3308 ; SSE-NEXT: pand %xmm1, %xmm8
3309 ; SSE-NEXT: por %xmm8, %xmm2
3310 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3311 ; SSE-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload
3312 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
3313 ; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill
3314 ; SSE-NEXT: pand %xmm10, %xmm7
3315 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,1,2,3]
3316 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
3317 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
3318 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,2]
3319 ; SSE-NEXT: packuswb %xmm8, %xmm7
3320 ; SSE-NEXT: movdqa %xmm15, %xmm8
3321 ; SSE-NEXT: pandn %xmm7, %xmm8
3322 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3323 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
3324 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[3,1,2,0]
3325 ; SSE-NEXT: pand %xmm10, %xmm7
3326 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,6,7]
3327 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,0,3]
3328 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[2,1,0,3,4,5,6,7]
3329 ; SSE-NEXT: packuswb %xmm9, %xmm9
3330 ; SSE-NEXT: pand %xmm15, %xmm9
3331 ; SSE-NEXT: por %xmm8, %xmm9
3332 ; SSE-NEXT: movdqa %xmm11, %xmm2
3333 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3334 ; SSE-NEXT: pand %xmm11, %xmm7
3335 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3336 ; SSE-NEXT: por %xmm7, %xmm2
3337 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,1,3]
3338 ; SSE-NEXT: pand %xmm10, %xmm7
3339 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,2,1,4,5,6,7]
3340 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3]
3341 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
3342 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
3343 ; SSE-NEXT: packuswb %xmm7, %xmm8
3344 ; SSE-NEXT: movdqa %xmm1, %xmm7
3345 ; SSE-NEXT: pandn %xmm8, %xmm7
3346 ; SSE-NEXT: pand %xmm1, %xmm9
3347 ; SSE-NEXT: por %xmm9, %xmm7
3348 ; SSE-NEXT: movdqa %xmm0, %xmm8
3349 ; SSE-NEXT: pxor %xmm5, %xmm5
3350 ; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15]
3351 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
3352 ; SSE-NEXT: movdqa %xmm0, %xmm9
3353 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[0,0]
3354 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[2,3]
3355 ; SSE-NEXT: psrlq $48, %xmm8
3356 ; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3357 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
3358 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[3,1,2,3,4,5,6,7]
3359 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
3360 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7]
3361 ; SSE-NEXT: packuswb %xmm9, %xmm8
3362 ; SSE-NEXT: movdqa %xmm15, %xmm10
3363 ; SSE-NEXT: pandn %xmm8, %xmm10
3364 ; SSE-NEXT: movdqa %xmm12, %xmm8
3365 ; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15]
3366 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,2,3]
3367 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
3368 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,65535]
3369 ; SSE-NEXT: movdqa %xmm0, %xmm11
3370 ; SSE-NEXT: pandn %xmm8, %xmm11
3371 ; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
3372 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,7,5,6,7]
3373 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
3374 ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm8[3,1,1,2,4,5,6,7]
3375 ; SSE-NEXT: pand %xmm0, %xmm12
3376 ; SSE-NEXT: por %xmm11, %xmm12
3377 ; SSE-NEXT: packuswb %xmm12, %xmm12
3378 ; SSE-NEXT: pand %xmm15, %xmm12
3379 ; SSE-NEXT: por %xmm10, %xmm12
3380 ; SSE-NEXT: movdqa %xmm13, %xmm8
3381 ; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15]
3382 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,7,5,6,7]
3383 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0]
3384 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,7,4]
3385 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,0,65535,0,0]
3386 ; SSE-NEXT: movdqa %xmm11, %xmm13
3387 ; SSE-NEXT: pandn %xmm10, %xmm13
3388 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
3389 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,1,1]
3390 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
3391 ; SSE-NEXT: pand %xmm11, %xmm8
3392 ; SSE-NEXT: por %xmm8, %xmm13
3393 ; SSE-NEXT: packuswb %xmm13, %xmm10
3394 ; SSE-NEXT: movdqa %xmm1, %xmm8
3395 ; SSE-NEXT: pandn %xmm10, %xmm8
3396 ; SSE-NEXT: pand %xmm1, %xmm12
3397 ; SSE-NEXT: por %xmm12, %xmm8
3398 ; SSE-NEXT: movdqa %xmm14, %xmm9
3399 ; SSE-NEXT: movdqa %xmm14, %xmm10
3400 ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15]
3401 ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7]
3402 ; SSE-NEXT: movdqa %xmm9, %xmm12
3403 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm10[0,0]
3404 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm10[2,3]
3405 ; SSE-NEXT: psrlq $48, %xmm10
3406 ; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3407 ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
3408 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm9[3,1,2,3,4,5,6,7]
3409 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3]
3410 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,7]
3411 ; SSE-NEXT: packuswb %xmm12, %xmm10
3412 ; SSE-NEXT: movdqa %xmm15, %xmm12
3413 ; SSE-NEXT: pandn %xmm10, %xmm12
3414 ; SSE-NEXT: movdqa %xmm4, %xmm10
3415 ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15]
3416 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3]
3417 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5]
3418 ; SSE-NEXT: movdqa %xmm0, %xmm14
3419 ; SSE-NEXT: pandn %xmm10, %xmm14
3420 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
3421 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,7,5,6,7]
3422 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
3423 ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm10[3,1,1,2,4,5,6,7]
3424 ; SSE-NEXT: pand %xmm0, %xmm13
3425 ; SSE-NEXT: por %xmm14, %xmm13
3426 ; SSE-NEXT: packuswb %xmm13, %xmm13
3427 ; SSE-NEXT: pand %xmm15, %xmm13
3428 ; SSE-NEXT: por %xmm12, %xmm13
3429 ; SSE-NEXT: movdqa %xmm3, %xmm10
3430 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
3431 ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm3[0,1,2,3,7,5,6,7]
3432 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,0]
3433 ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,7,4]
3434 ; SSE-NEXT: movdqa %xmm11, %xmm14
3435 ; SSE-NEXT: pandn %xmm12, %xmm14
3436 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
3437 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,3,1,1]
3438 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
3439 ; SSE-NEXT: pand %xmm11, %xmm10
3440 ; SSE-NEXT: por %xmm10, %xmm14
3441 ; SSE-NEXT: packuswb %xmm14, %xmm10
3442 ; SSE-NEXT: movdqa %xmm1, %xmm12
3443 ; SSE-NEXT: pandn %xmm10, %xmm12
3444 ; SSE-NEXT: pand %xmm1, %xmm13
3445 ; SSE-NEXT: por %xmm13, %xmm12
3446 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3447 ; SSE-NEXT: movdqa %xmm9, %xmm10
3448 ; SSE-NEXT: pxor %xmm3, %xmm3
3449 ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15]
3450 ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7]
3451 ; SSE-NEXT: pxor %xmm4, %xmm4
3452 ; SSE-NEXT: movdqa %xmm9, %xmm13
3453 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm10[0,0]
3454 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm10[2,3]
3455 ; SSE-NEXT: psrlq $48, %xmm10
3456 ; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3457 ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
3458 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm9[3,1,2,3,4,5,6,7]
3459 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3]
3460 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,7]
3461 ; SSE-NEXT: packuswb %xmm13, %xmm10
3462 ; SSE-NEXT: movdqa %xmm15, %xmm13
3463 ; SSE-NEXT: pandn %xmm10, %xmm13
3464 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3465 ; SSE-NEXT: movdqa %xmm3, %xmm10
3466 ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15]
3467 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3]
3468 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5]
3469 ; SSE-NEXT: movdqa %xmm0, %xmm9
3470 ; SSE-NEXT: pandn %xmm10, %xmm9
3471 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
3472 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,7,5,6,7]
3473 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
3474 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm10[3,1,1,2,4,5,6,7]
3475 ; SSE-NEXT: pand %xmm0, %xmm14
3476 ; SSE-NEXT: por %xmm9, %xmm14
3477 ; SSE-NEXT: packuswb %xmm14, %xmm14
3478 ; SSE-NEXT: pand %xmm15, %xmm14
3479 ; SSE-NEXT: por %xmm13, %xmm14
3480 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3481 ; SSE-NEXT: movdqa %xmm3, %xmm9
3482 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
3483 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,7,5,6,7]
3484 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0]
3485 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,7,4]
3486 ; SSE-NEXT: movdqa %xmm11, %xmm13
3487 ; SSE-NEXT: pandn %xmm10, %xmm13
3488 ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
3489 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,1,1]
3490 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
3491 ; SSE-NEXT: pand %xmm11, %xmm9
3492 ; SSE-NEXT: por %xmm9, %xmm13
3493 ; SSE-NEXT: packuswb %xmm13, %xmm9
3494 ; SSE-NEXT: movdqa %xmm1, %xmm13
3495 ; SSE-NEXT: pandn %xmm9, %xmm13
3496 ; SSE-NEXT: pand %xmm1, %xmm14
3497 ; SSE-NEXT: por %xmm14, %xmm13
3498 ; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload
3499 ; SSE-NEXT: movdqa %xmm3, %xmm9
3500 ; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15]
3501 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
3502 ; SSE-NEXT: movdqa %xmm3, %xmm10
3503 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm9[0,0]
3504 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm9[2,3]
3505 ; SSE-NEXT: psrlq $48, %xmm9
3506 ; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3507 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
3508 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[3,1,2,3,4,5,6,7]
3509 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3]
3510 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,7]
3511 ; SSE-NEXT: packuswb %xmm10, %xmm9
3512 ; SSE-NEXT: movdqa %xmm6, %xmm10
3513 ; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15]
3514 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3]
3515 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5]
3516 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
3517 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm6[0,1,2,3,7,5,6,7]
3518 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3]
3519 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[3,1,1,2,4,5,6,7]
3520 ; SSE-NEXT: pand %xmm0, %xmm14
3521 ; SSE-NEXT: pandn %xmm10, %xmm0
3522 ; SSE-NEXT: por %xmm14, %xmm0
3523 ; SSE-NEXT: packuswb %xmm0, %xmm0
3524 ; SSE-NEXT: pand %xmm15, %xmm0
3525 ; SSE-NEXT: pandn %xmm9, %xmm15
3526 ; SSE-NEXT: por %xmm15, %xmm0
3527 ; SSE-NEXT: movdqa %xmm2, %xmm4
3528 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
3529 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
3530 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,1,1]
3531 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
3532 ; SSE-NEXT: pand %xmm11, %xmm4
3533 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,7,5,6,7]
3534 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
3535 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,7,4]
3536 ; SSE-NEXT: pandn %xmm5, %xmm11
3537 ; SSE-NEXT: por %xmm4, %xmm11
3538 ; SSE-NEXT: pand %xmm1, %xmm0
3539 ; SSE-NEXT: packuswb %xmm11, %xmm4
3540 ; SSE-NEXT: pandn %xmm4, %xmm1
3541 ; SSE-NEXT: por %xmm0, %xmm1
3542 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3543 ; SSE-NEXT: movaps %xmm0, 16(%rsi)
3544 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3545 ; SSE-NEXT: movaps %xmm0, 32(%rsi)
3546 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3547 ; SSE-NEXT: movaps %xmm0, 48(%rsi)
3548 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3549 ; SSE-NEXT: movaps %xmm0, (%rsi)
3550 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3551 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
3552 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3553 ; SSE-NEXT: movaps %xmm0, 32(%rdx)
3554 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3555 ; SSE-NEXT: movaps %xmm0, 48(%rdx)
3556 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3557 ; SSE-NEXT: movaps %xmm0, (%rdx)
3558 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3559 ; SSE-NEXT: movaps %xmm0, 16(%rcx)
3560 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3561 ; SSE-NEXT: movaps %xmm0, 32(%rcx)
3562 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3563 ; SSE-NEXT: movaps %xmm0, 48(%rcx)
3564 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3565 ; SSE-NEXT: movaps %xmm0, (%rcx)
3566 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3567 ; SSE-NEXT: movaps %xmm0, 16(%r8)
3568 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3569 ; SSE-NEXT: movaps %xmm0, 32(%r8)
3570 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3571 ; SSE-NEXT: movaps %xmm0, 48(%r8)
3572 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3573 ; SSE-NEXT: movaps %xmm0, (%r8)
3574 ; SSE-NEXT: movdqa %xmm7, 16(%r9)
3575 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3576 ; SSE-NEXT: movaps %xmm0, 32(%r9)
3577 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3578 ; SSE-NEXT: movaps %xmm0, 48(%r9)
3579 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3580 ; SSE-NEXT: movaps %xmm0, (%r9)
3581 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
3582 ; SSE-NEXT: movdqa %xmm1, 16(%rax)
3583 ; SSE-NEXT: movdqa %xmm13, 32(%rax)
3584 ; SSE-NEXT: movdqa %xmm12, 48(%rax)
3585 ; SSE-NEXT: movdqa %xmm8, (%rax)
3586 ; SSE-NEXT: addq $792, %rsp # imm = 0x318
; SSE-NEXT: retq
;
3589 ; AVX1-ONLY-LABEL: load_i8_stride6_vf64:
3590 ; AVX1-ONLY: # %bb.0:
3591 ; AVX1-ONLY-NEXT: subq $616, %rsp # imm = 0x268
3592 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm2
3593 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm7
3594 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm5
3595 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm6
3596 ; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm8
3597 ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm9
3598 ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm10
3599 ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm11
3600 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0]
3601 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
3602 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0]
3603 ; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
3604 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0]
3605 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm0
3606 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm13
3607 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12]
3608 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm1
3609 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm15
3610 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3611 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm14
3612 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
3613 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm0
3614 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4
3615 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2
3616 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2
3617 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u]
3618 ; AVX1-ONLY-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm1
3619 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3620 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm9, %xmm1
3621 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm2
3622 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
3623 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm2
3624 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm3
3625 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
3626 ; AVX1-ONLY-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm1
3627 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3628 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0]
3629 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm1
3630 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13
3631 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13]
3632 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm2
3633 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
3634 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0]
3635 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
3636 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0]
3637 ; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
3638 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm2
3639 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm14
3640 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm15
3641 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm3
3642 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm7
3643 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
3644 ; AVX1-ONLY-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm1
3645 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3646 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm9, %xmm1
3647 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3648 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2
3649 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
3650 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm10, %xmm2
3651 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm3
3652 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
3653 ; AVX1-ONLY-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm0
3654 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3655 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0]
3656 ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
3657 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm0
3658 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm12
3659 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,0,6,12,0,0,0,128,128,0,6,12,0,0,0]
3660 ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
3661 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm1
3662 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13
3663 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1
3664 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,10,0,0,4,10,0,0,4,10,0,0,4,10,0,0]
3665 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm0
3666 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4
3667 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,2,8,14,0,2,8,14,0,2,8,14,0,2,8,14]
3668 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3669 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm2
3670 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
3671 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u]
3672 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
3673 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3674 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm1
3675 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm10, %xmm2
3676 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, %xmm12
3677 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
3678 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm2
3679 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm15
3680 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3681 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm3
3682 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
3683 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
3684 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3685 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [5,11,128,128,128,0,0,0,5,11,128,128,128,0,0,0]
3686 ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
3687 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3688 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm1
3689 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8
3690 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,1,7,13,0,0,0,128,128,1,7,13,0,0,0]
3691 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
3692 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm13
3693 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm2
3694 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14
3695 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
3696 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0]
3697 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm2
3698 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm9
3699 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm10
3700 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3701 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,3,9,15,0,3,9,15,0,3,9,15,0,3,9,15]
3702 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm3
3703 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
3704 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1
3705 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3706 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm11, %xmm1
3707 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm8
3708 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3709 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm12, %xmm2
3710 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3711 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
3712 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm15, %xmm2
3713 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3714 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm3
3715 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1]
3716 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
3717 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3718 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0]
3719 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
3720 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0
3721 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3722 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
3723 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0]
3724 ; AVX1-ONLY-NEXT: # xmm15 = mem[0,0]
3725 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1
3726 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3727 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1
3728 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0
3729 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,128,128,128,4,10,0,0,0,128,128,128,4,10]
3730 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
3731 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1
3732 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3733 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1
3734 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm11
3735 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2
3736 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3737 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
3738 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
3739 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2
3740 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,4,10,0,0,4,10,0,0,4,10,0,0,4,10]
3741 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm4
3742 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0]
3743 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm5
3744 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
3745 ; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3746 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm5
3747 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm7, %xmm9
3748 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm9, %xmm5
3749 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7]
3750 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255]
3751 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm13, %ymm2
3752 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm13, %ymm4
3753 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm4, %ymm9
3754 ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm2
3755 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3756 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2
3757 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm4
3758 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3759 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4
3760 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
3761 ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm2
3762 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3763 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2
3764 ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm5
3765 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3766 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,2,8,14,128,128,0,0,0,2,8,14,128,128]
3767 ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0]
3768 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm10
3769 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm10, %xmm10
3770 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
3771 ; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm4, %xmm10, %xmm10
3772 ; AVX1-ONLY-NEXT: vmovdqa %ymm2, %ymm5
3773 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
3774 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm9
3775 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
3776 ; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm2, %ymm10
3777 ; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm7
3778 ; AVX1-ONLY-NEXT: vorps %ymm10, %ymm9, %ymm4
3779 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3780 ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm2
3781 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3782 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm9
3783 ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm2
3784 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
3785 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm10
3786 ; AVX1-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9
3787 ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm2
3788 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3789 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm10
3790 ; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm2
3791 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3792 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm11
3793 ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10
3794 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9
3795 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm10
3796 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3797 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm2, %xmm11
3798 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
3799 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm3
3800 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm6
3801 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm6, %xmm3
3802 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm10[3,4,5],xmm3[6,7]
3803 ; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm13, %ymm6
3804 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm13, %ymm3
3805 ; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm11
3806 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm3, %ymm3
3807 ; AVX1-ONLY-NEXT: vmovdqa 320(%rdi), %xmm4
3808 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3809 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm0
3810 ; AVX1-ONLY-NEXT: vmovdqa 336(%rdi), %xmm4
3811 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm1
3812 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm10
3813 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3814 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3815 ; AVX1-ONLY-NEXT: vmovdqa 368(%rdi), %xmm1
3816 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3817 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[4,10]
3818 ; AVX1-ONLY-NEXT: vmovdqa 352(%rdi), %xmm4
3819 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3820 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero
3821 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm6, %xmm1
3822 ; AVX1-ONLY-NEXT: vpblendvb %xmm5, %xmm0, %xmm1, %xmm0
3823 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm1
3824 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
3825 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm7, %ymm0
3826 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0
3827 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3828 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [128,128,128,3,9,15,0,0,128,128,128,3,9,15,0,0]
3829 ; AVX1-ONLY-NEXT: # xmm8 = mem[0,0]
3830 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3831 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0
3832 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [1,7,13,128,128,128,0,0,1,7,13,128,128,128,0,0]
3833 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
3834 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3835 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1
3836 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0
3837 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3838 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,11]
3839 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [0,0,0,3,9,15,128,128,0,0,0,3,9,15,128,128]
3840 ; AVX1-ONLY-NEXT: # xmm9 = mem[0,0]
3841 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3842 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm14
3843 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm14, %xmm1
3844 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm14
3845 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,5,11,0,0,5,11,0,0,5,11,0,0,5,11]
3846 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3847 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
3848 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0]
3849 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3850 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm15
3851 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
3852 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3853 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm15
3854 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3855 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm13
3856 ; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13
3857 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3,4,5],xmm13[6,7]
3858 ; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm11, %ymm13
3859 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm11, %ymm0
3860 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0
3861 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3862 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm13
3863 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
3864 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm11, %xmm14
3865 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
3866 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3867 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,0,128,128,128,5,11,0,0,0,128,128,128,5,11]
3868 ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
3869 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm14
3870 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3871 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm15, %xmm15
3872 ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14
3873 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
3874 ; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm13, %xmm14, %xmm13
3875 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0]
3876 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0
3877 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13
3878 ; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm15, %ymm13
3879 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0
3880 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3881 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3882 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0
3883 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload
3884 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm13
3885 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm13, %xmm0
3886 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3887 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm13
3888 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3889 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm14
3890 ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13
3891 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0
3892 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3893 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm13
3894 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm14
3895 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
3896 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3897 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm8
3898 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3899 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4
3900 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm8, %xmm4
3901 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm13[3,4,5],xmm4[6,7]
3902 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255]
3903 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm1, %ymm0
3904 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm4, %ymm4
3905 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0
3906 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3907 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm3
3908 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm10, %xmm1
3909 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
3910 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3911 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,11]
3912 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3913 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm4
3914 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
3915 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
3916 ; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm1, %xmm3, %xmm1
3917 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm15, %ymm0
3918 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
3919 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1
3920 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0
3921 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3922 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0]
3923 ; AVX1-ONLY-NEXT: # xmm9 = mem[0,0]
3924 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3925 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm0
3926 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0]
3927 ; AVX1-ONLY-NEXT: # xmm13 = mem[0,0]
3928 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3929 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1
3930 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1
3931 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,4,10,128,128,128,0,0,0,4,10,128,128,128]
3932 ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0]
3933 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm4
3934 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,128,128,0,6,12,0,0,0,128,128,0,6,12]
3935 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
3936 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3937 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5
3938 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4
3939 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
3940 ; AVX1-ONLY-NEXT: vmovdqa %ymm8, %ymm9
3941 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm8, %ymm1
3942 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload
3943 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm4
3944 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0]
3945 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm1
3946 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12]
3947 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm5
3948 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm5[0],xmm1[0]
3949 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3950 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm5
3951 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm15
3952 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6
3953 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5
3954 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm1, %xmm5, %xmm5
3955 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
3956 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm4, %ymm4
3957 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
3958 ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm1, %ymm5
3959 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm1
3960 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3961 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3962 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm12[4,10,u,u,u,u,u,u,u,u,u,u,u]
3963 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
3964 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
3965 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4
3966 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
3967 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm5
3968 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3969 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6
3970 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5
3971 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
3972 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm9, %ymm4
3973 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload
3974 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4
3975 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3976 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm5
3977 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm6
3978 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0]
3979 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3980 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm0
3981 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm3
3982 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0
3983 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm5, %xmm0, %xmm0
3984 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
3985 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm4, %ymm3
3986 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
3987 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0
3988 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0
3989 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3990 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0]
3991 ; AVX1-ONLY-NEXT: # xmm14 = mem[0,0]
3992 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm0
3993 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0]
3994 ; AVX1-ONLY-NEXT: # xmm13 = mem[0,0]
3995 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3996 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm14, %xmm3
3997 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0
3998 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,0,5,11,128,128,128,0,0,0,5,11,128,128,128]
3999 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
4000 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4001 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm4
4002 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,128,128,1,7,13,0,0,0,128,128,1,7,13]
4003 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
4004 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4005 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm6
4006 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4
4007 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
4008 ; AVX1-ONLY-NEXT: vmovdqa %ymm9, %ymm13
4009 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0
4010 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
4011 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0
4012 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0]
4013 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4014 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm4
4015 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13]
4016 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4017 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm6
4018 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
4019 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm6
4020 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm7
4021 ; AVX1-ONLY-NEXT: vmovdqa %xmm15, %xmm8
4022 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6
4023 ; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm4, %xmm6, %xmm4
4024 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
4025 ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm0, %ymm0
4026 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
4027 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4
4028 ; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm7
4029 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm0
4030 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4031 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm12[5,11,u,u,u,u,u,u,u,u,u,u,u]
4032 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload
4033 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[3,9,15],zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u,u]
4034 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm0
4035 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm4
4036 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4037 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm15, %xmm6
4038 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4
4039 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
4040 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u]
4041 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4042 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm6
4043 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm6[0],xmm4[0]
4044 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm3
4045 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4046 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm5
4047 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3
4048 ; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm4, %xmm3, %xmm3
4049 ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm13, %ymm0
4050 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload
4051 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0
4052 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm0, %ymm0
4053 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2
4054 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2
4055 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0
4056 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4057 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0]
4058 ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
4059 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm0
4060 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [128,128,0,6,12,0,0,0,128,128,0,6,12,0,0,0]
4061 ; AVX1-ONLY-NEXT: # xmm14 = mem[0,0]
4062 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4063 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm2
4064 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0
4065 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,128,128,128,2,8,14,0,0,128,128,128,2,8,14]
4066 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
4067 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm2
4068 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,0,6,12,128,128,128,0,0,0,6,12,128,128,128]
4069 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
4070 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4071 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm5
4072 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm5, %xmm2
4073 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2
4074 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = [0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
4075 ; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload
4076 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm10, %ymm2
4077 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm2, %ymm2
4078 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm5
4079 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4080 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm6
4081 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5
4082 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [4,10,0,0,4,10,0,0,4,10,0,0,4,10,0,0]
4083 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4084 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm14, %xmm6
4085 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,2,8,14,0,2,8,14,0,2,8,14,0,2,8,14]
4086 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4087 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm7
4088 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1]
4089 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
4090 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
4091 ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2
4092 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
4093 ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm6, %ymm5
4094 ; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm0
4095 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm2, %ymm2
4096 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4097 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm12, %xmm5
4098 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4099 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm1[0,6,12,u,u,u,u,u,u,u,u,u,u,u]
4100 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5
4101 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm6
4102 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4103 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm7
4104 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6
4105 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
4106 ; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload
4107 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm10, %ymm5
4108 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5
4109 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm3
4110 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4111 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm4
4112 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
4113 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4114 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm4
4115 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4116 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm6
4117 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm6[1],xmm4[1]
4118 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6,7]
4119 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm4
4120 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4121 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3
4122 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm2
4123 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4124 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [5,11,128,128,128,0,0,0,5,11,128,128,128,0,0,0]
4125 ; AVX1-ONLY-NEXT: # xmm15 = mem[0,0]
4126 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4127 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm3
4128 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,1,7,13,0,0,0,128,128,1,7,13,0,0,0]
4129 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
4130 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm5
4131 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3
4132 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15]
4133 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
4134 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4135 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm7
4136 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,1,7,13,128,128,128,0,0,1,7,13,128,128,128]
4137 ; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
4138 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4139 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm8
4140 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7
4141 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3
4142 ; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload
4143 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm10, %ymm3
4144 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm3, %ymm3
4145 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4146 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm7
4147 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4148 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm8
4149 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7
4150 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [5,11,0,0,5,11,0,0,5,11,0,0,5,11,0,0]
4151 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm14, %xmm8
4152 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,3,9,15,0,3,9,15,0,3,9,15,0,3,9,15]
4153 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4154 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm9
4155 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm9[1],xmm8[1]
4156 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4],xmm7[5,6,7]
4157 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
4158 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3
4159 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
4160 ; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm2, %ymm7
4161 ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm3, %ymm3
4162 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload
4163 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm7, %xmm7
4164 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4165 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm8
4166 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7
4167 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4168 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm8
4169 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4170 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm9
4171 ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm9, %xmm8
4172 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7
4173 ; AVX1-ONLY-NEXT: vandnps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload
4174 ; AVX1-ONLY-NEXT: vandps %ymm7, %ymm10, %ymm0
4175 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0
4176 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4177 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm5
4178 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm11, %xmm6
4179 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5
4180 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm12, %xmm6
4181 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm7
4182 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm7[1],xmm6[1]
4183 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
4184 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm0, %ymm0
4185 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
4186 ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm2, %ymm1
4187 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0
4188 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4189 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi)
4190 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4191 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi)
4192 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4193 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx)
4194 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4195 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx)
4196 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4197 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx)
4198 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4199 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx)
4200 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4201 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8)
4202 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4203 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8)
4204 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4205 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r9)
4206 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4207 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r9)
4208 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
4209 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax)
4210 ; AVX1-ONLY-NEXT: vmovaps %ymm3, (%rax)
4211 ; AVX1-ONLY-NEXT: addq $616, %rsp # imm = 0x268
4212 ; AVX1-ONLY-NEXT: vzeroupper
4213 ; AVX1-ONLY-NEXT: retq
4214 ;
4215 ; AVX2-ONLY-LABEL: load_i8_stride6_vf64:
4216 ; AVX2-ONLY: # %bb.0:
4217 ; AVX2-ONLY-NEXT: subq $328, %rsp # imm = 0x148
4218 ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm7
4219 ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm3
4220 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm5
4221 ; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4222 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0
4223 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm1
4224 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0]
4225 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[0,1],ymm1[0,1]
4226 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4227 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
4228 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4229 ; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm2, %ymm0, %ymm4
4230 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u]
4231 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255]
4232 ; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm2
4233 ; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm5
4234 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill
4235 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
4236 ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm9
4237 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm2, %xmm3
4238 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
4239 ; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm12
4240 ; AVX2-ONLY-NEXT: vpor %xmm9, %xmm12, %xmm9
4241 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
4242 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm9, %ymm0, %ymm0
4243 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4244 ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm8
4245 ; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm14
4246 ; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4247 ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm14, %xmm0
4248 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm14, %xmm15
4249 ; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm15, %xmm10
4250 ; AVX2-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm1
4251 ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm11
4252 ; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm0
4253 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm9 = ymm0[0,1],ymm11[0,1]
4254 ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm0[2,3],ymm11[2,3]
4255 ; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm9, %ymm11, %ymm13
4256 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u]
4257 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0
4258 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4259 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u]
4260 ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm1
4261 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u]
4262 ; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
4263 ; AVX2-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1
4264 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11]
4265 ; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm4, %ymm4
4266 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1
4267 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4268 ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm0
4269 ; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm1
4270 ; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0
4271 ; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm13, %ymm1
4272 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0
4273 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4274 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255]
4275 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
4276 ; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm10, %ymm5, %ymm1
4277 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm1, %xmm2
4278 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u]
4279 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm3
4280 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u]
4281 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm5
4282 ; AVX2-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm5
4283 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12]
4284 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0]
4285 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4286 ; AVX2-ONLY-NEXT: vpblendvb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
4287 ; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm3, %ymm15
4288 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm5, %ymm15, %ymm5
4289 ; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4290 ; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm8, %ymm7, %ymm5
4291 ; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm7
4292 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm15
4293 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm6
4294 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm4
4295 ; AVX2-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4
4296 ; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm11, %ymm9, %ymm0
4297 ; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm0, %ymm6
4298 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm4, %ymm6, %ymm4
4299 ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4300 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u]
4301 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2
4302 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u]
4303 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1
4304 ; AVX2-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1
4305 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13]
4306 ; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm3, %ymm3
4307 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, %ymm3, %ymm1
4308 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4309 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm15, %xmm1
4310 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm3
4311 ; AVX2-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1
4312 ; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm0, %ymm0
4313 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm14
4314 ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm0
4315 ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm3
4316 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u]
4317 ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm1
4318 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4319 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255]
4320 ; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm5
4321 ; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm15
4322 ; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm4
4323 ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm6
4324 ; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm6, %ymm4, %ymm1
4325 ; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm12
4326 ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm4, %ymm6, %ymm0
4327 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4328 ; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload
4329 ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm10, %ymm0, %ymm8
4330 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4331 ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm10
4332 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u]
4333 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4334 ; AVX2-ONLY-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
4335 ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm11, %ymm9, %ymm0
4336 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4337 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128]
4338 ; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm6
4339 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm11
4340 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12]
4341 ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm9
4342 ; AVX2-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6
4343 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
4344 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
4345 ; AVX2-ONLY-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload
4346 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4347 ; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm2
4348 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm1, %xmm6
4349 ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0
4350 ; AVX2-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0
4351 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4352 ; AVX2-ONLY-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
4353 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4354 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128]
4355 ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm2
4356 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13]
4357 ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm11
4358 ; AVX2-ONLY-NEXT: vpor %xmm2, %xmm11, %xmm2
4359 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4360 ; AVX2-ONLY-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
4361 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4362 ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0
4363 ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm1
4364 ; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0
4365 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4366 ; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm14, %ymm0, %ymm0
4367 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
4368 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm15, %xmm14
4369 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14]
4370 ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm0
4371 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128]
4372 ; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm1
4373 ; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1
4374 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm8, %xmm3
4375 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u]
4376 ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6
4377 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u]
4378 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm11
4379 ; AVX2-ONLY-NEXT: vpor %xmm6, %xmm11, %xmm6
4380 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm11 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14]
4381 ; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm13, %ymm0
4382 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm0[5,6,7]
4383 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
4384 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4385 ; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0
4386 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4387 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm12, %xmm1
4388 ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm0
4389 ; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm2
4390 ; AVX2-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0
4391 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm10, %xmm2
4392 ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5
4393 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm4
4394 ; AVX2-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4
4395 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
4396 ; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm6, %ymm5
4397 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
4398 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
4399 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4400 ; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm5
4401 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15]
4402 ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm4
4403 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128]
4404 ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm11
4405 ; AVX2-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4
4406 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u]
4407 ; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3
4408 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u]
4409 ; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm8
4410 ; AVX2-ONLY-NEXT: vpor %xmm3, %xmm8, %xmm3
4411 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15]
4412 ; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm13, %ymm13
4413 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm13[5,6,7]
4414 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5,6,7]
4415 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
4416 ; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm4
4417 ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0
4418 ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm1
4419 ; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0
4420 ; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm1
4421 ; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm2
4422 ; AVX2-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
4423 ; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm6, %ymm2
4424 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
4425 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
4426 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4427 ; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm3
4428 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
4429 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm0
4430 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10]
4431 ; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm2
4432 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128]
4433 ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm8
4434 ; AVX2-ONLY-NEXT: vmovdqa %ymm9, %ymm10
4435 ; AVX2-ONLY-NEXT: vpor %xmm2, %xmm8, %xmm2
4436 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4437 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
4438 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7],ymm8[8,9,10],ymm2[11,12,13,14,15]
4439 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm2[4,5,6,7]
4440 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
4441 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm8
4442 ; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm1
4443 ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm7
4444 ; AVX2-ONLY-NEXT: vmovdqa %ymm9, %ymm11
4445 ; AVX2-ONLY-NEXT: vpor %xmm1, %xmm7, %xmm1
4446 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4447 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
4448 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7],ymm7[8,9,10],ymm1[11,12,13,14,15]
4449 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
4450 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11]
4451 ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm0
4452 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128]
4453 ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm10
4454 ; AVX2-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm0
4455 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4456 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
4457 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15]
4458 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
4459 ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm7
4460 ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm8
4461 ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7
4462 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
4463 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
4464 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15]
4465 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
4466 ; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rsi)
4467 ; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rsi)
4468 ; AVX2-ONLY-NEXT: vmovdqa %ymm7, 32(%rdx)
4469 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%rdx)
4470 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4471 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%rcx)
4472 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4473 ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%rcx)
4474 ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
4475 ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8)
4476 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4477 ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8)
4478 ; AVX2-ONLY-NEXT: vmovdqa %ymm5, 32(%r9)
4479 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4480 ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r9)
4481 ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
4482 ; AVX2-ONLY-NEXT: vmovdqa %ymm3, 32(%rax)
4483 ; AVX2-ONLY-NEXT: vmovdqa %ymm4, (%rax)
4484 ; AVX2-ONLY-NEXT: addq $328, %rsp # imm = 0x148
4485 ; AVX2-ONLY-NEXT: vzeroupper
4486 ; AVX2-ONLY-NEXT: retq
4487 ;
4488 ; AVX512F-LABEL: load_i8_stride6_vf64:
4489 ; AVX512F: # %bb.0:
4490 ; AVX512F-NEXT: subq $40, %rsp
4491 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
4492 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
4493 ; AVX512F-NEXT: vmovdqa64 224(%rdi), %ymm25
4494 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm26
4495 ; AVX512F-NEXT: vmovdqa %ymm12, %ymm0
4496 ; AVX512F-NEXT: vpternlogq $202, %ymm25, %ymm26, %ymm0
4497 ; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm3
4498 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
4499 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
4500 ; AVX512F-NEXT: vpshufb %xmm5, %xmm4, %xmm6
4501 ; AVX512F-NEXT: vpor %xmm3, %xmm6, %xmm9
4502 ; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm30
4503 ; AVX512F-NEXT: vmovdqa64 32(%rdi), %ymm31
4504 ; AVX512F-NEXT: vmovdqa64 128(%rdi), %ymm24
4505 ; AVX512F-NEXT: vmovdqa64 160(%rdi), %ymm18
4506 ; AVX512F-NEXT: vmovdqa %ymm12, %ymm6
4507 ; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm18, %ymm6
4508 ; AVX512F-NEXT: vextracti128 $1, %ymm6, %xmm7
4509 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10]
4510 ; AVX512F-NEXT: vpshufb %xmm3, %xmm7, %xmm10
4511 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128]
4512 ; AVX512F-NEXT: vpshufb %xmm8, %xmm6, %xmm13
4513 ; AVX512F-NEXT: vpor %xmm10, %xmm13, %xmm10
4514 ; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
4515 ; AVX512F-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm2
4516 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4517 ; AVX512F-NEXT: vmovdqa %ymm12, %ymm9
4518 ; AVX512F-NEXT: vpternlogq $202, %ymm31, %ymm30, %ymm9
4519 ; AVX512F-NEXT: vpshufb %xmm1, %xmm9, %xmm1
4520 ; AVX512F-NEXT: vextracti128 $1, %ymm9, %xmm13
4521 ; AVX512F-NEXT: vpshufb %xmm5, %xmm13, %xmm5
4522 ; AVX512F-NEXT: vporq %xmm1, %xmm5, %xmm17
4523 ; AVX512F-NEXT: vmovdqa64 320(%rdi), %ymm29
4524 ; AVX512F-NEXT: vmovdqa64 352(%rdi), %ymm22
4525 ; AVX512F-NEXT: vmovdqa %ymm12, %ymm1
4526 ; AVX512F-NEXT: vpternlogq $202, %ymm29, %ymm22, %ymm1
4527 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5
4528 ; AVX512F-NEXT: vpshufb %xmm3, %xmm5, %xmm3
4529 ; AVX512F-NEXT: vpshufb %xmm8, %xmm1, %xmm8
4530 ; AVX512F-NEXT: vpor %xmm3, %xmm8, %xmm3
4531 ; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
4532 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u]
4533 ; AVX512F-NEXT: vpshufb %xmm8, %xmm0, %xmm0
4534 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u]
4535 ; AVX512F-NEXT: vpshufb %xmm10, %xmm4, %xmm4
4536 ; AVX512F-NEXT: vpor %xmm0, %xmm4, %xmm0
4537 ; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4538 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11]
4539 ; AVX512F-NEXT: vpshufb %xmm0, %xmm7, %xmm4
4540 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128]
4541 ; AVX512F-NEXT: vpshufb %xmm7, %xmm6, %xmm6
4542 ; AVX512F-NEXT: vporq %xmm4, %xmm6, %xmm28
4543 ; AVX512F-NEXT: vpshufb %xmm8, %xmm9, %xmm4
4544 ; AVX512F-NEXT: vpshufb %xmm10, %xmm13, %xmm6
4545 ; AVX512F-NEXT: vporq %xmm4, %xmm6, %xmm21
4546 ; AVX512F-NEXT: vpshufb %xmm0, %xmm5, %xmm0
4547 ; AVX512F-NEXT: vpshufb %xmm7, %xmm1, %xmm1
4548 ; AVX512F-NEXT: vporq %xmm0, %xmm1, %xmm27
4549 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u]
4550 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
4551 ; AVX512F-NEXT: vmovdqa %ymm9, %ymm4
4552 ; AVX512F-NEXT: vpternlogq $202, %ymm26, %ymm25, %ymm4
4553 ; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm15
4554 ; AVX512F-NEXT: vpshufb %xmm0, %xmm15, %xmm1
4555 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u]
4556 ; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm5
4557 ; AVX512F-NEXT: vpor %xmm1, %xmm5, %xmm1
4558 ; AVX512F-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4559 ; AVX512F-NEXT: vmovdqa %ymm12, %ymm5
4560 ; AVX512F-NEXT: vpternlogq $202, %ymm18, %ymm24, %ymm5
4561 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128]
4562 ; AVX512F-NEXT: vpshufb %xmm8, %xmm5, %xmm7
4563 ; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm1
4564 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12]
4565 ; AVX512F-NEXT: vpshufb %xmm10, %xmm1, %xmm13
4566 ; AVX512F-NEXT: vpor %xmm7, %xmm13, %xmm2
4567 ; AVX512F-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4568 ; AVX512F-NEXT: vmovdqa %ymm9, %ymm13
4569 ; AVX512F-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm13
4570 ; AVX512F-NEXT: vextracti128 $1, %ymm13, %xmm14
4571 ; AVX512F-NEXT: vpshufb %xmm0, %xmm14, %xmm0
4572 ; AVX512F-NEXT: vpshufb %xmm6, %xmm13, %xmm6
4573 ; AVX512F-NEXT: vporq %xmm0, %xmm6, %xmm16
4574 ; AVX512F-NEXT: vmovdqa %ymm12, %ymm11
4575 ; AVX512F-NEXT: vpternlogq $202, %ymm22, %ymm29, %ymm11
4576 ; AVX512F-NEXT: vpshufb %xmm8, %xmm11, %xmm8
4577 ; AVX512F-NEXT: vextracti128 $1, %ymm11, %xmm7
4578 ; AVX512F-NEXT: vpshufb %xmm10, %xmm7, %xmm10
4579 ; AVX512F-NEXT: vpor %xmm8, %xmm10, %xmm0
4580 ; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4581 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u]
4582 ; AVX512F-NEXT: vpshufb %xmm10, %xmm15, %xmm15
4583 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u]
4584 ; AVX512F-NEXT: vpshufb %xmm8, %xmm4, %xmm4
4585 ; AVX512F-NEXT: vpor %xmm4, %xmm15, %xmm0
4586 ; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4587 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,1,7,13],zero,zero,zero,xmm5[5,11],zero,zero,zero
4588 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u],zero,zero,zero,xmm1[3,9,15],zero,zero,xmm1[1,7,13]
4589 ; AVX512F-NEXT: vpor %xmm1, %xmm15, %xmm0
4590 ; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4591 ; AVX512F-NEXT: vmovdqa 256(%rdi), %ymm1
4592 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} ymm19 = ymm1[2,3],mem[2,3]
4593 ; AVX512F-NEXT: vinserti32x4 $1, 288(%rdi), %ymm1, %ymm20
4594 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,0]
4595 ; AVX512F-NEXT: vmovdqa %ymm5, %ymm1
4596 ; AVX512F-NEXT: vpternlogq $202, %ymm19, %ymm20, %ymm1
4597 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm15 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u]
4598 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0,1,2],ymm3[3,4,5,6,7],ymm15[8,9,10],ymm3[11,12,13,14,15]
4599 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5,6,7]
4600 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm23
4601 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} ymm6 = ymm23[2,3],mem[2,3]
4602 ; AVX512F-NEXT: vinserti32x4 $1, 96(%rdi), %ymm23, %ymm23
4603 ; AVX512F-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm15
4604 ; AVX512F-NEXT: vmovdqa %ymm5, %ymm2
4605 ; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm23, %ymm2
4606 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255]
4607 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,8,14,4,10,16,22,28,18,24,30],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4608 ; AVX512F-NEXT: vpternlogq $248, %ymm4, %ymm17, %ymm0
4609 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
4610 ; AVX512F-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm0 # 64-byte Folded Reload
4611 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
4612 ; AVX512F-NEXT: vpternlogq $184, %zmm0, %zmm3, %zmm15
4613 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u]
4614 ; AVX512F-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm1
4615 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
4616 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4617 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,9,15,5,11,17,23,29,19,25,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4618 ; AVX512F-NEXT: vpternlogq $248, %ymm4, %ymm21, %ymm1
4619 ; AVX512F-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm2
4620 ; AVX512F-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 16-byte Folded Reload
4621 ; AVX512F-NEXT: vpternlogq $226, %zmm2, %zmm17, %zmm1
4622 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm17
4623 ; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm3, %zmm17
4624 ; AVX512F-NEXT: vpshufb %xmm10, %xmm14, %xmm0
4625 ; AVX512F-NEXT: vpshufb %xmm8, %xmm13, %xmm1
4626 ; AVX512F-NEXT: vporq %xmm0, %xmm1, %xmm21
4627 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,1,7,13],zero,zero,zero,xmm11[5,11],zero,zero,zero
4628 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[u,u,u,u,u],zero,zero,zero,xmm7[3,9,15],zero,zero,xmm7[1,7,13]
4629 ; AVX512F-NEXT: vporq %xmm0, %xmm1, %xmm28
4630 ; AVX512F-NEXT: vmovdqa64 %ymm25, %ymm11
4631 ; AVX512F-NEXT: vpternlogq $226, %ymm26, %ymm12, %ymm11
4632 ; AVX512F-NEXT: vextracti128 $1, %ymm11, %xmm0
4633 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u]
4634 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u]
4635 ; AVX512F-NEXT: vpshufb %xmm3, %xmm11, %xmm2
4636 ; AVX512F-NEXT: vmovdqa64 %xmm3, %xmm25
4637 ; AVX512F-NEXT: vporq %xmm1, %xmm2, %xmm26
4638 ; AVX512F-NEXT: vmovdqa64 %ymm18, %ymm14
4639 ; AVX512F-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm14
4640 ; AVX512F-NEXT: vextracti128 $1, %ymm14, %xmm10
4641 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14]
4642 ; AVX512F-NEXT: vpshufb %xmm1, %xmm10, %xmm2
4643 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128]
4644 ; AVX512F-NEXT: vpshufb %xmm3, %xmm14, %xmm4
4645 ; AVX512F-NEXT: vporq %xmm2, %xmm4, %xmm27
4646 ; AVX512F-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm12
4647 ; AVX512F-NEXT: vmovdqa %ymm5, %ymm4
4648 ; AVX512F-NEXT: vpternlogq $202, %ymm23, %ymm6, %ymm4
4649 ; AVX512F-NEXT: vpternlogq $202, %ymm29, %ymm22, %ymm9
4650 ; AVX512F-NEXT: vextracti128 $1, %ymm9, %xmm8
4651 ; AVX512F-NEXT: vpshufb %xmm1, %xmm8, %xmm1
4652 ; AVX512F-NEXT: vpshufb %xmm3, %xmm9, %xmm2
4653 ; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm7
4654 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u]
4655 ; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0
4656 ; AVX512F-NEXT: vmovdqa64 %xmm1, %xmm22
4657 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm13 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u]
4658 ; AVX512F-NEXT: vpshufb %xmm13, %xmm11, %xmm1
4659 ; AVX512F-NEXT: vpor %xmm0, %xmm1, %xmm3
4660 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15]
4661 ; AVX512F-NEXT: vpshufb %xmm2, %xmm10, %xmm1
4662 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128]
4663 ; AVX512F-NEXT: vpshufb %xmm0, %xmm14, %xmm10
4664 ; AVX512F-NEXT: vpor %xmm1, %xmm10, %xmm10
4665 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128]
4666 ; AVX512F-NEXT: vpshufb %ymm1, %ymm4, %ymm11
4667 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255]
4668 ; AVX512F-NEXT: vpternlogq $236, %ymm18, %ymm11, %ymm16
4669 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,5,11,1,7,13,19,25,31,21,27,128,128,128,128,128,128,128,128,128,128,128]
4670 ; AVX512F-NEXT: vpshufb %ymm11, %ymm4, %ymm4
4671 ; AVX512F-NEXT: vpternlogq $236, %ymm18, %ymm4, %ymm21
4672 ; AVX512F-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
4673 ; AVX512F-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm5
4674 ; AVX512F-NEXT: vpshufb %ymm1, %ymm5, %ymm1
4675 ; AVX512F-NEXT: vpternlogq $248, %ymm18, %ymm4, %ymm1
4676 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4
4677 ; AVX512F-NEXT: vpshufb %ymm11, %ymm5, %ymm5
4678 ; AVX512F-NEXT: vextracti128 $1, %ymm12, %xmm1
4679 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm1[0,6,12],zero,zero,zero,xmm1[4,10,u,u,u,u,u,u]
4680 ; AVX512F-NEXT: vmovdqa64 %xmm25, %xmm14
4681 ; AVX512F-NEXT: vpshufb %xmm14, %xmm12, %xmm14
4682 ; AVX512F-NEXT: vpor %xmm11, %xmm14, %xmm11
4683 ; AVX512F-NEXT: vpshufb %xmm2, %xmm8, %xmm2
4684 ; AVX512F-NEXT: vpshufb %xmm0, %xmm9, %xmm0
4685 ; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0
4686 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
4687 ; AVX512F-NEXT: vpternlogq $226, %ymm23, %ymm2, %ymm6
4688 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28,u,u,u,u,u,u,u,u,u,u,u]
4689 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3,4],xmm8[5,6,7]
4690 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
4691 ; AVX512F-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm9
4692 ; AVX512F-NEXT: vpternlogq $248, %ymm18, %ymm9, %ymm5
4693 ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5
4694 ; AVX512F-NEXT: vpternlogq $202, %ymm20, %ymm19, %ymm2
4695 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
4696 ; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
4697 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,0,6,12,2,8,14,20,26,16,22,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4698 ; AVX512F-NEXT: vpternlogq $242, %ymm7, %ymm9, %ymm11
4699 ; AVX512F-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm7
4700 ; AVX512F-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm11
4701 ; AVX512F-NEXT: vinserti32x4 $2, %xmm26, %zmm11, %zmm11
4702 ; AVX512F-NEXT: vpternlogq $226, %zmm11, %zmm9, %zmm8
4703 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0]
4704 ; AVX512F-NEXT: vpternlogq $184, %zmm8, %zmm11, %zmm7
4705 ; AVX512F-NEXT: vmovdqa64 %xmm22, %xmm8
4706 ; AVX512F-NEXT: vpshufb %xmm8, %xmm1, %xmm1
4707 ; AVX512F-NEXT: vpshufb %xmm13, %xmm12, %xmm8
4708 ; AVX512F-NEXT: vpor %xmm1, %xmm8, %xmm1
4709 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29,u,u,u,u,u,u,u,u,u,u,u]
4710 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7]
4711 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
4712 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4713 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,1,7,13,3,9,15,21,27,17,23,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4714 ; AVX512F-NEXT: vpternlogq $242, %ymm0, %ymm9, %ymm2
4715 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
4716 ; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm2
4717 ; AVX512F-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm2
4718 ; AVX512F-NEXT: vpternlogq $226, %zmm2, %zmm9, %zmm1
4719 ; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm11, %zmm0
4720 ; AVX512F-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
4721 ; AVX512F-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
4722 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
4723 ; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm16
4724 ; AVX512F-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
4725 ; AVX512F-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 16-byte Folded Reload
4726 ; AVX512F-NEXT: vpternlogq $184, %zmm1, %zmm2, %zmm21
4727 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
4728 ; AVX512F-NEXT: vpternlogq $184, %zmm16, %zmm1, %zmm4
4729 ; AVX512F-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm5
4730 ; AVX512F-NEXT: vmovdqa64 %zmm15, (%rsi)
4731 ; AVX512F-NEXT: vmovdqa64 %zmm17, (%rdx)
4732 ; AVX512F-NEXT: vmovdqa64 %zmm4, (%rcx)
4733 ; AVX512F-NEXT: vmovdqa64 %zmm5, (%r8)
4734 ; AVX512F-NEXT: vmovdqa64 %zmm7, (%r9)
4735 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
4736 ; AVX512F-NEXT: vmovdqa64 %zmm0, (%rax)
4737 ; AVX512F-NEXT: addq $40, %rsp
4738 ; AVX512F-NEXT: vzeroupper
4739 ; AVX512F-NEXT: retq
4740 ;
4741 ; AVX512BW-LABEL: load_i8_stride6_vf64:
4742 ; AVX512BW: # %bb.0:
4743 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
4744 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u]
4745 ; AVX512BW-NEXT: vmovdqa 224(%rdi), %ymm0
4746 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm23
4747 ; AVX512BW-NEXT: movw $18724, %r10w # imm = 0x4924
4748 ; AVX512BW-NEXT: kmovd %r10d, %k1
4749 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm23, %ymm9 {%k1}
4750 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm9, %xmm1
4751 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u]
4752 ; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm12
4753 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm12, %xmm3
4754 ; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm5
4755 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm10
4756 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm3
4757 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm6
4758 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %ymm26
4759 ; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm1
4760 ; AVX512BW-NEXT: vpblendmw %ymm26, %ymm1, %ymm15 {%k1}
4761 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16
4762 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10]
4763 ; AVX512BW-NEXT: vpshufb %xmm17, %xmm16, %xmm11
4764 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128]
4765 ; AVX512BW-NEXT: vpshufb %xmm18, %xmm15, %xmm13
4766 ; AVX512BW-NEXT: vpor %xmm11, %xmm13, %xmm11
4767 ; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
4768 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm5, %zmm11, %zmm11
4769 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm6[2,3],mem[2,3]
4770 ; AVX512BW-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm13
4771 ; AVX512BW-NEXT: movw $-28124, %r10w # imm = 0x9224
4772 ; AVX512BW-NEXT: kmovd %r10d, %k4
4773 ; AVX512BW-NEXT: vpblendmw %ymm5, %ymm13, %ymm19 {%k4}
4774 ; AVX512BW-NEXT: vpblendmw %ymm3, %ymm10, %ymm20 {%k1}
4775 ; AVX512BW-NEXT: vpshufb %xmm2, %xmm20, %xmm2
4776 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm21
4777 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm21, %xmm4
4778 ; AVX512BW-NEXT: vpor %xmm2, %xmm4, %xmm2
4779 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10]
4780 ; AVX512BW-NEXT: movl $4192256, %r10d # imm = 0x3FF800
4781 ; AVX512BW-NEXT: kmovd %r10d, %k2
4782 ; AVX512BW-NEXT: vpshufb %ymm6, %ymm19, %ymm2 {%k2}
4783 ; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm2 {%k2}
4784 ; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm11
4785 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm11[2,3],mem[2,3]
4786 ; AVX512BW-NEXT: vinserti128 $1, 288(%rdi), %ymm11, %ymm14
4787 ; AVX512BW-NEXT: vpblendmw %ymm4, %ymm14, %ymm22 {%k4}
4788 ; AVX512BW-NEXT: vpshufb %ymm6, %ymm22, %ymm7
4789 ; AVX512BW-NEXT: vmovdqa 320(%rdi), %ymm11
4790 ; AVX512BW-NEXT: vmovdqa 352(%rdi), %ymm6
4791 ; AVX512BW-NEXT: vpblendmw %ymm11, %ymm6, %ymm24 {%k1}
4792 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm24, %xmm25
4793 ; AVX512BW-NEXT: vpshufb %xmm17, %xmm25, %xmm17
4794 ; AVX512BW-NEXT: vpshufb %xmm18, %xmm24, %xmm18
4795 ; AVX512BW-NEXT: vporq %xmm17, %xmm18, %xmm17
4796 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm8
4797 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15]
4798 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
4799 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7
4800 ; AVX512BW-NEXT: movabsq $-8796093022208, %rdi # imm = 0xFFFFF80000000000
4801 ; AVX512BW-NEXT: kmovq %rdi, %k3
4802 ; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm2 {%k3}
4803 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u]
4804 ; AVX512BW-NEXT: vpshufb %xmm7, %xmm9, %xmm8
4805 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u]
4806 ; AVX512BW-NEXT: vpshufb %xmm9, %xmm12, %xmm12
4807 ; AVX512BW-NEXT: vpor %xmm8, %xmm12, %xmm8
4808 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11]
4809 ; AVX512BW-NEXT: vpshufb %xmm12, %xmm16, %xmm16
4810 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128]
4811 ; AVX512BW-NEXT: vpshufb %xmm17, %xmm15, %xmm15
4812 ; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15
4813 ; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
4814 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm8, %zmm15, %zmm8
4815 ; AVX512BW-NEXT: vpshufb %xmm7, %xmm20, %xmm7
4816 ; AVX512BW-NEXT: vpshufb %xmm9, %xmm21, %xmm9
4817 ; AVX512BW-NEXT: vpor %xmm7, %xmm9, %xmm9
4818 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11]
4819 ; AVX512BW-NEXT: vpshufb %ymm7, %ymm19, %ymm9 {%k2}
4820 ; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2}
4821 ; AVX512BW-NEXT: vpshufb %ymm7, %ymm22, %ymm7
4822 ; AVX512BW-NEXT: vpshufb %xmm12, %xmm25, %xmm8
4823 ; AVX512BW-NEXT: vpshufb %xmm17, %xmm24, %xmm12
4824 ; AVX512BW-NEXT: vpor %xmm8, %xmm12, %xmm8
4825 ; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
4826 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm8[3,4,5,6,7],ymm7[8,9,10],ymm8[11,12,13,14,15]
4827 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
4828 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7
4829 ; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm9 {%k3}
4830 ; AVX512BW-NEXT: vpblendmw %ymm13, %ymm5, %ymm15 {%k4}
4831 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u]
4832 ; AVX512BW-NEXT: movw $9362, %di # imm = 0x2492
4833 ; AVX512BW-NEXT: kmovd %edi, %k2
4834 ; AVX512BW-NEXT: vpblendmw %ymm10, %ymm3, %ymm8 {%k2}
4835 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm8, %xmm16
4836 ; AVX512BW-NEXT: vpshufb %xmm7, %xmm16, %xmm12
4837 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u]
4838 ; AVX512BW-NEXT: vpshufb %xmm17, %xmm8, %xmm18
4839 ; AVX512BW-NEXT: vporq %xmm12, %xmm18, %xmm18
4840 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm19 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12]
4841 ; AVX512BW-NEXT: movl $2095104, %edi # imm = 0x1FF800
4842 ; AVX512BW-NEXT: kmovd %edi, %k5
4843 ; AVX512BW-NEXT: vpshufb %ymm19, %ymm15, %ymm18 {%k5}
4844 ; AVX512BW-NEXT: vpblendmw %ymm23, %ymm0, %ymm20 {%k2}
4845 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm21
4846 ; AVX512BW-NEXT: vpshufb %xmm7, %xmm21, %xmm7
4847 ; AVX512BW-NEXT: vpshufb %xmm17, %xmm20, %xmm12
4848 ; AVX512BW-NEXT: vpor %xmm7, %xmm12, %xmm7
4849 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm26, %ymm17 {%k1}
4850 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm22 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128]
4851 ; AVX512BW-NEXT: vpshufb %xmm22, %xmm17, %xmm12
4852 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm17, %xmm24
4853 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm25 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12]
4854 ; AVX512BW-NEXT: vpshufb %xmm25, %xmm24, %xmm27
4855 ; AVX512BW-NEXT: vporq %xmm12, %xmm27, %xmm12
4856 ; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
4857 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm7, %zmm12, %zmm12
4858 ; AVX512BW-NEXT: movl $2097151, %edi # imm = 0x1FFFFF
4859 ; AVX512BW-NEXT: kmovq %rdi, %k6
4860 ; AVX512BW-NEXT: vmovdqu8 %zmm18, %zmm12 {%k6}
4861 ; AVX512BW-NEXT: vpblendmw %ymm14, %ymm4, %ymm7 {%k4}
4862 ; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm18 {%k1}
4863 ; AVX512BW-NEXT: vpshufb %xmm22, %xmm18, %xmm22
4864 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm18, %xmm27
4865 ; AVX512BW-NEXT: vpshufb %xmm25, %xmm27, %xmm25
4866 ; AVX512BW-NEXT: vporq %xmm22, %xmm25, %xmm22
4867 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm22
4868 ; AVX512BW-NEXT: vpshufb %ymm19, %ymm7, %ymm22 {%k5}
4869 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm19
4870 ; AVX512BW-NEXT: vmovdqu8 %zmm19, %zmm12 {%k3}
4871 ; AVX512BW-NEXT: movw $9289, %di # imm = 0x2449
4872 ; AVX512BW-NEXT: kmovd %edi, %k4
4873 ; AVX512BW-NEXT: vmovdqu16 %ymm14, %ymm4 {%k4}
4874 ; AVX512BW-NEXT: vmovdqu16 %ymm13, %ymm5 {%k4}
4875 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u]
4876 ; AVX512BW-NEXT: vpshufb %xmm13, %xmm16, %xmm14
4877 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u]
4878 ; AVX512BW-NEXT: vpshufb %xmm16, %xmm8, %xmm8
4879 ; AVX512BW-NEXT: vpor %xmm14, %xmm8, %xmm8
4880 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm14 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13]
4881 ; AVX512BW-NEXT: vpshufb %ymm14, %ymm15, %ymm8 {%k5}
4882 ; AVX512BW-NEXT: vpshufb %xmm13, %xmm21, %xmm13
4883 ; AVX512BW-NEXT: vpshufb %xmm16, %xmm20, %xmm15
4884 ; AVX512BW-NEXT: vpor %xmm13, %xmm15, %xmm13
4885 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128]
4886 ; AVX512BW-NEXT: vpshufb %xmm15, %xmm17, %xmm16
4887 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13]
4888 ; AVX512BW-NEXT: vpshufb %xmm17, %xmm24, %xmm19
4889 ; AVX512BW-NEXT: vporq %xmm16, %xmm19, %xmm16
4890 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
4891 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm13, %zmm16, %zmm13
4892 ; AVX512BW-NEXT: vmovdqu8 %zmm8, %zmm13 {%k6}
4893 ; AVX512BW-NEXT: vpshufb %xmm15, %xmm18, %xmm8
4894 ; AVX512BW-NEXT: vpshufb %xmm17, %xmm27, %xmm15
4895 ; AVX512BW-NEXT: vpor %xmm8, %xmm15, %xmm8
4896 ; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
4897 ; AVX512BW-NEXT: vpshufb %ymm14, %ymm7, %ymm8 {%k5}
4898 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm7
4899 ; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm13 {%k3}
4900 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14]
4901 ; AVX512BW-NEXT: vpshufb %ymm7, %ymm5, %ymm8
4902 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u]
4903 ; AVX512BW-NEXT: vmovdqu16 %ymm10, %ymm3 {%k1}
4904 ; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm15
4905 ; AVX512BW-NEXT: vpshufb %xmm14, %xmm15, %xmm10
4906 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u]
4907 ; AVX512BW-NEXT: vpshufb %xmm16, %xmm3, %xmm17
4908 ; AVX512BW-NEXT: vporq %xmm10, %xmm17, %xmm10
4909 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm8[5,6,7]
4910 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7]
4911 ; AVX512BW-NEXT: vmovdqu16 %ymm23, %ymm0 {%k1}
4912 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm8
4913 ; AVX512BW-NEXT: vpshufb %xmm14, %xmm8, %xmm14
4914 ; AVX512BW-NEXT: vpshufb %xmm16, %xmm0, %xmm16
4915 ; AVX512BW-NEXT: vporq %xmm14, %xmm16, %xmm14
4916 ; AVX512BW-NEXT: vmovdqu16 %ymm26, %ymm1 {%k2}
4917 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm1, %xmm16
4918 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14]
4919 ; AVX512BW-NEXT: vpshufb %xmm17, %xmm16, %xmm18
4920 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128]
4921 ; AVX512BW-NEXT: vpshufb %xmm19, %xmm1, %xmm20
4922 ; AVX512BW-NEXT: vporq %xmm18, %xmm20, %xmm18
4923 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18
4924 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm14, %zmm18, %zmm14
4925 ; AVX512BW-NEXT: movabsq $4398044413952, %rdi # imm = 0x3FFFFE00000
4926 ; AVX512BW-NEXT: kmovq %rdi, %k1
4927 ; AVX512BW-NEXT: vmovdqu8 %zmm14, %zmm10 {%k1}
4928 ; AVX512BW-NEXT: vpshufb %ymm7, %ymm4, %ymm7
4929 ; AVX512BW-NEXT: vmovdqu16 %ymm11, %ymm6 {%k2}
4930 ; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm11
4931 ; AVX512BW-NEXT: vpshufb %xmm17, %xmm11, %xmm14
4932 ; AVX512BW-NEXT: vpshufb %xmm19, %xmm6, %xmm17
4933 ; AVX512BW-NEXT: vporq %xmm14, %xmm17, %xmm14
4934 ; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
4935 ; AVX512BW-NEXT: movl $-2097152, %edi # imm = 0xFFE00000
4936 ; AVX512BW-NEXT: kmovd %edi, %k2
4937 ; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm7 {%k2}
4938 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7
4939 ; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm10 {%k2}
4940 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15]
4941 ; AVX512BW-NEXT: vpshufb %ymm7, %ymm5, %ymm5
4942 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u]
4943 ; AVX512BW-NEXT: vpshufb %xmm14, %xmm15, %xmm15
4944 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u]
4945 ; AVX512BW-NEXT: vpshufb %xmm17, %xmm3, %xmm3
4946 ; AVX512BW-NEXT: vpor %xmm3, %xmm15, %xmm3
4947 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
4948 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
4949 ; AVX512BW-NEXT: vpshufb %xmm14, %xmm8, %xmm5
4950 ; AVX512BW-NEXT: vpshufb %xmm17, %xmm0, %xmm0
4951 ; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
4952 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15]
4953 ; AVX512BW-NEXT: vpshufb %xmm5, %xmm16, %xmm8
4954 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128]
4955 ; AVX512BW-NEXT: vpshufb %xmm14, %xmm1, %xmm1
4956 ; AVX512BW-NEXT: vpor %xmm1, %xmm8, %xmm1
4957 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4958 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
4959 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1}
4960 ; AVX512BW-NEXT: vpshufb %ymm7, %ymm4, %ymm0
4961 ; AVX512BW-NEXT: vpshufb %xmm5, %xmm11, %xmm1
4962 ; AVX512BW-NEXT: vpshufb %xmm14, %xmm6, %xmm4
4963 ; AVX512BW-NEXT: vpor %xmm1, %xmm4, %xmm1
4964 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4965 ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2}
4966 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
4967 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2}
4968 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi)
4969 ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx)
4970 ; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rcx)
4971 ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r8)
4972 ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r9)
4973 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax)
4974 ; AVX512BW-NEXT: vzeroupper
4975 ; AVX512BW-NEXT: retq
4976 %wide.vec = load <384 x i8>, ptr %in.vec, align 64
4977 %strided.vec0 = shufflevector <384 x i8> %wide.vec, <384 x i8> poison, <64 x i32> <i32 0, i32 6, i32 12, i32 18, i32 24, i32 30, i32 36, i32 42, i32 48, i32 54, i32 60, i32 66, i32 72, i32 78, i32 84, i32 90, i32 96, i32 102, i32 108, i32 114, i32 120, i32 126, i32 132, i32 138, i32 144, i32 150, i32 156, i32 162, i32 168, i32 174, i32 180, i32 186, i32 192, i32 198, i32 204, i32 210, i32 216, i32 222, i32 228, i32 234, i32 240, i32 246, i32 252, i32 258, i32 264, i32 270, i32 276, i32 282, i32 288, i32 294, i32 300, i32 306, i32 312, i32 318, i32 324, i32 330, i32 336, i32 342, i32 348, i32 354, i32 360, i32 366, i32 372, i32 378>
4978 %strided.vec1 = shufflevector <384 x i8> %wide.vec, <384 x i8> poison, <64 x i32> <i32 1, i32 7, i32 13, i32 19, i32 25, i32 31, i32 37, i32 43, i32 49, i32 55, i32 61, i32 67, i32 73, i32 79, i32 85, i32 91, i32 97, i32 103, i32 109, i32 115, i32 121, i32 127, i32 133, i32 139, i32 145, i32 151, i32 157, i32 163, i32 169, i32 175, i32 181, i32 187, i32 193, i32 199, i32 205, i32 211, i32 217, i32 223, i32 229, i32 235, i32 241, i32 247, i32 253, i32 259, i32 265, i32 271, i32 277, i32 283, i32 289, i32 295, i32 301, i32 307, i32 313, i32 319, i32 325, i32 331, i32 337, i32 343, i32 349, i32 355, i32 361, i32 367, i32 373, i32 379>
4979 %strided.vec2 = shufflevector <384 x i8> %wide.vec, <384 x i8> poison, <64 x i32> <i32 2, i32 8, i32 14, i32 20, i32 26, i32 32, i32 38, i32 44, i32 50, i32 56, i32 62, i32 68, i32 74, i32 80, i32 86, i32 92, i32 98, i32 104, i32 110, i32 116, i32 122, i32 128, i32 134, i32 140, i32 146, i32 152, i32 158, i32 164, i32 170, i32 176, i32 182, i32 188, i32 194, i32 200, i32 206, i32 212, i32 218, i32 224, i32 230, i32 236, i32 242, i32 248, i32 254, i32 260, i32 266, i32 272, i32 278, i32 284, i32 290, i32 296, i32 302, i32 308, i32 314, i32 320, i32 326, i32 332, i32 338, i32 344, i32 350, i32 356, i32 362, i32 368, i32 374, i32 380>
4980 %strided.vec3 = shufflevector <384 x i8> %wide.vec, <384 x i8> poison, <64 x i32> <i32 3, i32 9, i32 15, i32 21, i32 27, i32 33, i32 39, i32 45, i32 51, i32 57, i32 63, i32 69, i32 75, i32 81, i32 87, i32 93, i32 99, i32 105, i32 111, i32 117, i32 123, i32 129, i32 135, i32 141, i32 147, i32 153, i32 159, i32 165, i32 171, i32 177, i32 183, i32 189, i32 195, i32 201, i32 207, i32 213, i32 219, i32 225, i32 231, i32 237, i32 243, i32 249, i32 255, i32 261, i32 267, i32 273, i32 279, i32 285, i32 291, i32 297, i32 303, i32 309, i32 315, i32 321, i32 327, i32 333, i32 339, i32 345, i32 351, i32 357, i32 363, i32 369, i32 375, i32 381>
4981 %strided.vec4 = shufflevector <384 x i8> %wide.vec, <384 x i8> poison, <64 x i32> <i32 4, i32 10, i32 16, i32 22, i32 28, i32 34, i32 40, i32 46, i32 52, i32 58, i32 64, i32 70, i32 76, i32 82, i32 88, i32 94, i32 100, i32 106, i32 112, i32 118, i32 124, i32 130, i32 136, i32 142, i32 148, i32 154, i32 160, i32 166, i32 172, i32 178, i32 184, i32 190, i32 196, i32 202, i32 208, i32 214, i32 220, i32 226, i32 232, i32 238, i32 244, i32 250, i32 256, i32 262, i32 268, i32 274, i32 280, i32 286, i32 292, i32 298, i32 304, i32 310, i32 316, i32 322, i32 328, i32 334, i32 340, i32 346, i32 352, i32 358, i32 364, i32 370, i32 376, i32 382>
4982 %strided.vec5 = shufflevector <384 x i8> %wide.vec, <384 x i8> poison, <64 x i32> <i32 5, i32 11, i32 17, i32 23, i32 29, i32 35, i32 41, i32 47, i32 53, i32 59, i32 65, i32 71, i32 77, i32 83, i32 89, i32 95, i32 101, i32 107, i32 113, i32 119, i32 125, i32 131, i32 137, i32 143, i32 149, i32 155, i32 161, i32 167, i32 173, i32 179, i32 185, i32 191, i32 197, i32 203, i32 209, i32 215, i32 221, i32 227, i32 233, i32 239, i32 245, i32 251, i32 257, i32 263, i32 269, i32 275, i32 281, i32 287, i32 293, i32 299, i32 305, i32 311, i32 317, i32 323, i32 329, i32 335, i32 341, i32 347, i32 353, i32 359, i32 365, i32 371, i32 377, i32 383>
4983 store <64 x i8> %strided.vec0, ptr %out.vec0, align 64
4984 store <64 x i8> %strided.vec1, ptr %out.vec1, align 64
4985 store <64 x i8> %strided.vec2, ptr %out.vec2, align 64
4986 store <64 x i8> %strided.vec3, ptr %out.vec3, align 64
4987 store <64 x i8> %strided.vec4, ptr %out.vec4, align 64
4988 store <64 x i8> %strided.vec5, ptr %out.vec5, align 64
4989 ret void
4990 }
4991 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
4994 ; AVX2-FAST-PERLANE: {{.*}}
4997 ; AVX512-FAST: {{.*}}
4998 ; AVX512-SLOW: {{.*}}
4999 ; AVX512BW-FAST: {{.*}}
5000 ; AVX512BW-ONLY: {{.*}}
5001 ; AVX512BW-ONLY-FAST: {{.*}}
5002 ; AVX512BW-ONLY-SLOW: {{.*}}
5003 ; AVX512BW-SLOW: {{.*}}
5004 ; AVX512DQ-FAST: {{.*}}
5005 ; AVX512DQ-ONLY: {{.*}}
5006 ; AVX512DQ-SLOW: {{.*}}
5007 ; AVX512DQBW-FAST: {{.*}}
5008 ; AVX512DQBW-ONLY: {{.*}}
5009 ; AVX512DQBW-SLOW: {{.*}}
5010 ; AVX512F-FAST: {{.*}}
5011 ; AVX512F-ONLY: {{.*}}
5012 ; AVX512F-ONLY-FAST: {{.*}}
5013 ; AVX512F-ONLY-SLOW: {{.*}}
5014 ; AVX512F-SLOW: {{.*}}
5015 ; FALLBACK0: {{.*}}
5016 ; FALLBACK1: {{.*}}
5017 ; FALLBACK10: {{.*}}
5018 ; FALLBACK11: {{.*}}
5019 ; FALLBACK12: {{.*}}