1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,FALLBACK0
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-ONLY,FALLBACK1
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-SLOW,FALLBACK2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST,FALLBACK3
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST-PERLANE,FALLBACK4
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12
16 ; These patterns are produced by the LoopVectorizer for interleaved loads.
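;
; A minimal illustrative sketch (not part of the test itself) of the kind of
; scalar loop the LoopVectorizer turns into the wide-load-plus-shufflevector
; IR checked below. The names %src, %dst0, %n, %entry, %exit and %loop are
; hypothetical:
;
;   loop:
;     %i    = phi i64 [ 0, %entry ], [ %i.next, %loop ]
;     %base = mul i64 %i, 5
;     %p0   = getelementptr inbounds i8, ptr %src, i64 %base
;     %v0   = load i8, ptr %p0              ; element 0 of each group of 5
;     %q0   = getelementptr inbounds i8, ptr %dst0, i64 %i
;     store i8 %v0, ptr %q0                 ; elements 1..4 are handled likewise
;     %i.next = add nuw i64 %i, 1
;     %done = icmp eq i64 %i.next, %n
;     br i1 %done, label %exit, label %loop
;
; With vectorization factor VF, the five strided scalar loads become a single
; wide load of <5*VF x i8> followed by shufflevectors selecting lanes
; 0,5,10,... (vec0), 1,6,11,... (vec1), and so on, which is the IR in the
; functions below.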
18 define void @load_i8_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
19 ; SSE-LABEL: load_i8_stride5_vf2:
20 ; SSE: # %bb.0:
21 ; SSE-NEXT: movdqa (%rdi), %xmm1
22 ; SSE-NEXT: pxor %xmm2, %xmm2
23 ; SSE-NEXT: movdqa %xmm1, %xmm0
24 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
25 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
26 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
27 ; SSE-NEXT: packuswb %xmm3, %xmm3
28 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,3,2,3]
29 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7]
30 ; SSE-NEXT: packuswb %xmm4, %xmm4
31 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[3,1,2,3]
32 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
33 ; SSE-NEXT: packuswb %xmm5, %xmm5
34 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
35 ; SSE-NEXT: psrlq $48, %xmm0
36 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
37 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
38 ; SSE-NEXT: packuswb %xmm0, %xmm0
39 ; SSE-NEXT: psrld $16, %xmm1
40 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
41 ; SSE-NEXT: packuswb %xmm6, %xmm6
42 ; SSE-NEXT: movd %xmm3, %eax
43 ; SSE-NEXT: movw %ax, (%rsi)
44 ; SSE-NEXT: movd %xmm4, %eax
45 ; SSE-NEXT: movw %ax, (%rdx)
46 ; SSE-NEXT: movd %xmm5, %eax
47 ; SSE-NEXT: movw %ax, (%rcx)
48 ; SSE-NEXT: movd %xmm0, %eax
49 ; SSE-NEXT: movw %ax, (%r8)
50 ; SSE-NEXT: movd %xmm6, %eax
51 ; SSE-NEXT: movw %ax, (%r9)
52 ; SSE-NEXT: retq
53 ;
54 ; AVX-LABEL: load_i8_stride5_vf2:
55 ; AVX: # %bb.0:
56 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
57 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
58 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[1,6,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
59 ; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
60 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[3,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
61 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
62 ; AVX-NEXT: vpextrw $0, %xmm1, (%rsi)
63 ; AVX-NEXT: vpextrw $0, %xmm2, (%rdx)
64 ; AVX-NEXT: vpextrw $0, %xmm3, (%rcx)
65 ; AVX-NEXT: vpextrw $0, %xmm4, (%r8)
66 ; AVX-NEXT: vpextrw $0, %xmm0, (%r9)
67 ; AVX-NEXT: retq
68 %wide.vec = load <10 x i8>, ptr %in.vec, align 64
69 %strided.vec0 = shufflevector <10 x i8> %wide.vec, <10 x i8> poison, <2 x i32> <i32 0, i32 5>
70 %strided.vec1 = shufflevector <10 x i8> %wide.vec, <10 x i8> poison, <2 x i32> <i32 1, i32 6>
71 %strided.vec2 = shufflevector <10 x i8> %wide.vec, <10 x i8> poison, <2 x i32> <i32 2, i32 7>
72 %strided.vec3 = shufflevector <10 x i8> %wide.vec, <10 x i8> poison, <2 x i32> <i32 3, i32 8>
73 %strided.vec4 = shufflevector <10 x i8> %wide.vec, <10 x i8> poison, <2 x i32> <i32 4, i32 9>
74 store <2 x i8> %strided.vec0, ptr %out.vec0, align 64
75 store <2 x i8> %strided.vec1, ptr %out.vec1, align 64
76 store <2 x i8> %strided.vec2, ptr %out.vec2, align 64
77 store <2 x i8> %strided.vec3, ptr %out.vec3, align 64
78 store <2 x i8> %strided.vec4, ptr %out.vec4, align 64
79 ret void
80 }
82 define void @load_i8_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
83 ; SSE-LABEL: load_i8_stride5_vf4:
84 ; SSE: # %bb.0:
85 ; SSE-NEXT: movdqa (%rdi), %xmm5
86 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
87 ; SSE-NEXT: pxor %xmm4, %xmm4
88 ; SSE-NEXT: movdqa %xmm5, %xmm2
89 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
90 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
91 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[2,1,2,3,4,5,6,7]
92 ; SSE-NEXT: movdqa %xmm5, %xmm3
93 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
94 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
95 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
96 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
97 ; SSE-NEXT: packuswb %xmm1, %xmm1
98 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
99 ; SSE-NEXT: movdqa %xmm5, %xmm7
100 ; SSE-NEXT: pand %xmm6, %xmm7
101 ; SSE-NEXT: pandn %xmm0, %xmm6
102 ; SSE-NEXT: por %xmm7, %xmm6
103 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
104 ; SSE-NEXT: movdqa %xmm2, %xmm7
105 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm6[0,0]
106 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,3]
107 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1,1,3]
108 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,6,7]
109 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
110 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,2,3,0,4,5,6,7]
111 ; SSE-NEXT: packuswb %xmm6, %xmm6
112 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,65535,65535,65535,65535]
113 ; SSE-NEXT: movdqa %xmm5, %xmm8
114 ; SSE-NEXT: pand %xmm7, %xmm8
115 ; SSE-NEXT: pandn %xmm0, %xmm7
116 ; SSE-NEXT: por %xmm8, %xmm7
117 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
118 ; SSE-NEXT: movdqa %xmm2, %xmm8
119 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[3,0]
120 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[0,2]
121 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7]
122 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
123 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
124 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,1,4,5,6,7]
125 ; SSE-NEXT: packuswb %xmm7, %xmm7
126 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
127 ; SSE-NEXT: pand %xmm8, %xmm5
128 ; SSE-NEXT: pandn %xmm0, %xmm8
129 ; SSE-NEXT: por %xmm5, %xmm8
130 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
131 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,1],xmm2[2,0]
132 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,6,5,6,7]
133 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
134 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,0,1,2,4,5,6,7]
135 ; SSE-NEXT: packuswb %xmm4, %xmm4
136 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[3,0]
137 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0,2]
138 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
139 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
140 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7]
141 ; SSE-NEXT: packuswb %xmm2, %xmm2
142 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
143 ; SSE-NEXT: pand %xmm3, %xmm2
144 ; SSE-NEXT: pandn %xmm0, %xmm3
145 ; SSE-NEXT: por %xmm2, %xmm3
146 ; SSE-NEXT: movd %xmm1, (%rsi)
147 ; SSE-NEXT: movd %xmm6, (%rdx)
148 ; SSE-NEXT: movd %xmm7, (%rcx)
149 ; SSE-NEXT: movd %xmm4, (%r8)
150 ; SSE-NEXT: movd %xmm3, (%r9)
151 ; SSE-NEXT: retq
152 ;
153 ; AVX1-ONLY-LABEL: load_i8_stride5_vf4:
154 ; AVX1-ONLY: # %bb.0:
155 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15]
156 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
157 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
158 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm3
159 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
160 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4
161 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
162 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5
163 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
164 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6
165 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
166 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0
167 ; AVX1-ONLY-NEXT: vmovd %xmm3, (%rsi)
168 ; AVX1-ONLY-NEXT: vmovd %xmm4, (%rdx)
169 ; AVX1-ONLY-NEXT: vmovd %xmm5, (%rcx)
170 ; AVX1-ONLY-NEXT: vmovd %xmm6, (%r8)
171 ; AVX1-ONLY-NEXT: vmovd %xmm0, (%r9)
172 ; AVX1-ONLY-NEXT: retq
174 ; AVX2-LABEL: load_i8_stride5_vf4:
175 ; AVX2: # %bb.0:
176 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15]
177 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
178 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
179 ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm3
180 ; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
181 ; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm4
182 ; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
183 ; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm5
184 ; AVX2-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
185 ; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm6
186 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
187 ; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
188 ; AVX2-NEXT: vmovd %xmm3, (%rsi)
189 ; AVX2-NEXT: vmovd %xmm4, (%rdx)
190 ; AVX2-NEXT: vmovd %xmm5, (%rcx)
191 ; AVX2-NEXT: vmovd %xmm6, (%r8)
192 ; AVX2-NEXT: vmovd %xmm0, (%r9)
193 ; AVX2-NEXT: retq
194 %wide.vec = load <20 x i8>, ptr %in.vec, align 64
195 %strided.vec0 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
196 %strided.vec1 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 1, i32 6, i32 11, i32 16>
197 %strided.vec2 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 2, i32 7, i32 12, i32 17>
198 %strided.vec3 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 3, i32 8, i32 13, i32 18>
199 %strided.vec4 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 4, i32 9, i32 14, i32 19>
200 store <4 x i8> %strided.vec0, ptr %out.vec0, align 64
201 store <4 x i8> %strided.vec1, ptr %out.vec1, align 64
202 store <4 x i8> %strided.vec2, ptr %out.vec2, align 64
203 store <4 x i8> %strided.vec3, ptr %out.vec3, align 64
204 store <4 x i8> %strided.vec4, ptr %out.vec4, align 64
205 ret void
206 }
208 define void @load_i8_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
209 ; SSE-LABEL: load_i8_stride5_vf8:
210 ; SSE: # %bb.0:
211 ; SSE-NEXT: movdqa (%rdi), %xmm4
212 ; SSE-NEXT: movdqa 16(%rdi), %xmm3
213 ; SSE-NEXT: movdqa 32(%rdi), %xmm0
214 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
215 ; SSE-NEXT: movdqa %xmm1, %xmm2
216 ; SSE-NEXT: pandn %xmm3, %xmm2
217 ; SSE-NEXT: movdqa %xmm4, %xmm5
218 ; SSE-NEXT: pand %xmm1, %xmm5
219 ; SSE-NEXT: por %xmm2, %xmm5
220 ; SSE-NEXT: pxor %xmm6, %xmm6
221 ; SSE-NEXT: movdqa %xmm5, %xmm2
222 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
223 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,0,0,65535,65535]
224 ; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
225 ; SSE-NEXT: pand %xmm7, %xmm5
226 ; SSE-NEXT: pandn %xmm2, %xmm7
227 ; SSE-NEXT: por %xmm5, %xmm7
228 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[0,2,1,3,4,5,6,7]
229 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
230 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
231 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7]
232 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,6,5,7]
233 ; SSE-NEXT: packuswb %xmm7, %xmm7
234 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
235 ; SSE-NEXT: pand %xmm2, %xmm7
236 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,1,1]
237 ; SSE-NEXT: movdqa %xmm2, %xmm5
238 ; SSE-NEXT: pandn %xmm8, %xmm5
239 ; SSE-NEXT: por %xmm7, %xmm5
240 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
241 ; SSE-NEXT: movdqa %xmm3, %xmm8
242 ; SSE-NEXT: pand %xmm7, %xmm8
243 ; SSE-NEXT: pandn %xmm4, %xmm7
244 ; SSE-NEXT: por %xmm8, %xmm7
245 ; SSE-NEXT: movdqa %xmm7, %xmm8
246 ; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
247 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,0,65535,65535,65535,0]
248 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
249 ; SSE-NEXT: pand %xmm9, %xmm7
250 ; SSE-NEXT: pandn %xmm8, %xmm9
251 ; SSE-NEXT: por %xmm7, %xmm9
252 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,2,1,3]
253 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,5]
254 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,1]
255 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7]
256 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,4,5,7]
257 ; SSE-NEXT: packuswb %xmm7, %xmm7
258 ; SSE-NEXT: pand %xmm2, %xmm7
259 ; SSE-NEXT: movdqa %xmm0, %xmm8
260 ; SSE-NEXT: pslld $24, %xmm8
261 ; SSE-NEXT: pandn %xmm8, %xmm2
262 ; SSE-NEXT: por %xmm7, %xmm2
263 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
264 ; SSE-NEXT: movdqa %xmm3, %xmm8
265 ; SSE-NEXT: pand %xmm7, %xmm8
266 ; SSE-NEXT: pandn %xmm4, %xmm7
267 ; SSE-NEXT: por %xmm8, %xmm7
268 ; SSE-NEXT: movdqa %xmm7, %xmm9
269 ; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
270 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535]
271 ; SSE-NEXT: movdqa %xmm8, %xmm10
272 ; SSE-NEXT: pandn %xmm9, %xmm10
273 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
274 ; SSE-NEXT: pand %xmm8, %xmm7
275 ; SSE-NEXT: por %xmm10, %xmm7
276 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7]
277 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
278 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3]
279 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,1,4,5,6,7]
280 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,5,6,7]
281 ; SSE-NEXT: packuswb %xmm10, %xmm10
282 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535]
283 ; SSE-NEXT: pand %xmm7, %xmm10
284 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
285 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,2,0]
286 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5,6,5]
287 ; SSE-NEXT: packuswb %xmm11, %xmm11
288 ; SSE-NEXT: movdqa %xmm7, %xmm9
289 ; SSE-NEXT: pandn %xmm11, %xmm9
290 ; SSE-NEXT: por %xmm10, %xmm9
291 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
292 ; SSE-NEXT: movdqa %xmm3, %xmm11
293 ; SSE-NEXT: pand %xmm10, %xmm11
294 ; SSE-NEXT: pandn %xmm4, %xmm10
295 ; SSE-NEXT: por %xmm11, %xmm10
296 ; SSE-NEXT: movdqa %xmm10, %xmm11
297 ; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15]
298 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
299 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm11[2,0]
300 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,6,5]
301 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[3,1,2,0]
302 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,0,1,2,4,5,6,7]
303 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,7,4,6,7]
304 ; SSE-NEXT: packuswb %xmm11, %xmm11
305 ; SSE-NEXT: pand %xmm7, %xmm11
306 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,1,0,3]
307 ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm10[0,1,2,3,4,5,5,6]
308 ; SSE-NEXT: packuswb %xmm12, %xmm12
309 ; SSE-NEXT: movdqa %xmm7, %xmm10
310 ; SSE-NEXT: pandn %xmm12, %xmm10
311 ; SSE-NEXT: por %xmm11, %xmm10
312 ; SSE-NEXT: pand %xmm1, %xmm3
313 ; SSE-NEXT: pandn %xmm4, %xmm1
314 ; SSE-NEXT: por %xmm3, %xmm1
315 ; SSE-NEXT: movdqa %xmm1, %xmm3
316 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
317 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
318 ; SSE-NEXT: pand %xmm8, %xmm1
319 ; SSE-NEXT: pandn %xmm3, %xmm8
320 ; SSE-NEXT: por %xmm1, %xmm8
321 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[3,1,0,3,4,5,6,7]
322 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4]
323 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
324 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
325 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
326 ; SSE-NEXT: packuswb %xmm1, %xmm1
327 ; SSE-NEXT: pand %xmm7, %xmm1
328 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
329 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
330 ; SSE-NEXT: packuswb %xmm0, %xmm0
331 ; SSE-NEXT: pandn %xmm0, %xmm7
332 ; SSE-NEXT: por %xmm1, %xmm7
333 ; SSE-NEXT: movq %xmm5, (%rsi)
334 ; SSE-NEXT: movq %xmm2, (%rdx)
335 ; SSE-NEXT: movq %xmm9, (%rcx)
336 ; SSE-NEXT: movq %xmm10, (%r8)
337 ; SSE-NEXT: movq %xmm7, (%r9)
338 ; SSE-NEXT: retq
339 ;
340 ; AVX1-ONLY-LABEL: load_i8_stride5_vf8:
341 ; AVX1-ONLY: # %bb.0:
342 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
343 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
344 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2
345 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
346 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
347 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
348 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
349 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,12,7,u,u,u,u,u,u,u,u]
350 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
351 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[0,5,10,15,u,u,u,u,u,u,u,u,u]
352 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4
353 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
354 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,2,4,6,8,10,12,9,u,u,u,u,u,u,u,u]
355 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11,u,u,u,u,u,u,u,u,u,u]
356 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u]
357 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5
358 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
359 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,2,4,6,8,10,1,11,u,u,u,u,u,u,u,u]
360 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u,u,u,u]
361 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u]
362 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6
363 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
364 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,2,4,6,8,10,3,13,u,u,u,u,u,u,u,u]
365 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13,u,u,u,u,u,u,u,u,u,u]
366 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u]
367 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0
368 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
369 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,5,15,u,u,u,u,u,u,u,u]
370 ; AVX1-ONLY-NEXT: vmovq %xmm3, (%rsi)
371 ; AVX1-ONLY-NEXT: vmovq %xmm4, (%rdx)
372 ; AVX1-ONLY-NEXT: vmovq %xmm5, (%rcx)
373 ; AVX1-ONLY-NEXT: vmovq %xmm6, (%r8)
374 ; AVX1-ONLY-NEXT: vmovq %xmm0, (%r9)
375 ; AVX1-ONLY-NEXT: retq
377 ; AVX2-LABEL: load_i8_stride5_vf8:
378 ; AVX2: # %bb.0:
379 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
380 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
381 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
382 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm1[4,9,14],zero,xmm1[u,u,u,u,u,u,u,u]
383 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
384 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,5,10,15],zero,zero,zero,xmm4[3,u,u,u,u,u,u,u,u]
385 ; AVX2-NEXT: vpor %xmm3, %xmm4, %xmm3
386 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[0,5,10,15],zero,xmm1[u,u,u,u,u,u,u,u]
387 ; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
388 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,u,u,u,u,u,u,u,u]
389 ; AVX2-NEXT: vpor %xmm4, %xmm5, %xmm4
390 ; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[1,6,11],zero,zero,xmm1[u,u,u,u,u,u,u,u]
391 ; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1],xmm2[2],xmm0[3,4,5,6,7]
392 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,u,u,u,u,u,u,u,u]
393 ; AVX2-NEXT: vpor %xmm5, %xmm6, %xmm5
394 ; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm1[2,7,12],zero,zero,xmm1[u,u,u,u,u,u,u,u]
395 ; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
396 ; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,u,u,u,u,u,u,u,u]
397 ; AVX2-NEXT: vpor %xmm6, %xmm7, %xmm6
398 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,xmm1[u,u,u,u,u,u,u,u]
399 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4,5,6,7]
400 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,u,u,u,u,u,u,u,u]
401 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
402 ; AVX2-NEXT: vmovq %xmm3, (%rsi)
403 ; AVX2-NEXT: vmovq %xmm4, (%rdx)
404 ; AVX2-NEXT: vmovq %xmm5, (%rcx)
405 ; AVX2-NEXT: vmovq %xmm6, (%r8)
406 ; AVX2-NEXT: vmovq %xmm0, (%r9)
407 ; AVX2-NEXT: retq
408 %wide.vec = load <40 x i8>, ptr %in.vec, align 64
409 %strided.vec0 = shufflevector <40 x i8> %wide.vec, <40 x i8> poison, <8 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35>
410 %strided.vec1 = shufflevector <40 x i8> %wide.vec, <40 x i8> poison, <8 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36>
411 %strided.vec2 = shufflevector <40 x i8> %wide.vec, <40 x i8> poison, <8 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37>
412 %strided.vec3 = shufflevector <40 x i8> %wide.vec, <40 x i8> poison, <8 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38>
413 %strided.vec4 = shufflevector <40 x i8> %wide.vec, <40 x i8> poison, <8 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39>
414 store <8 x i8> %strided.vec0, ptr %out.vec0, align 64
415 store <8 x i8> %strided.vec1, ptr %out.vec1, align 64
416 store <8 x i8> %strided.vec2, ptr %out.vec2, align 64
417 store <8 x i8> %strided.vec3, ptr %out.vec3, align 64
418 store <8 x i8> %strided.vec4, ptr %out.vec4, align 64
419 ret void
420 }
422 define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
423 ; SSE-LABEL: load_i8_stride5_vf16:
424 ; SSE: # %bb.0:
425 ; SSE-NEXT: movdqa 64(%rdi), %xmm9
426 ; SSE-NEXT: movdqa (%rdi), %xmm1
427 ; SSE-NEXT: movdqa 16(%rdi), %xmm6
428 ; SSE-NEXT: movdqa 32(%rdi), %xmm10
429 ; SSE-NEXT: movdqa 48(%rdi), %xmm2
430 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
431 ; SSE-NEXT: movdqa %xmm3, %xmm0
432 ; SSE-NEXT: pandn %xmm10, %xmm0
433 ; SSE-NEXT: movdqa %xmm2, %xmm4
434 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
435 ; SSE-NEXT: pand %xmm3, %xmm4
436 ; SSE-NEXT: por %xmm0, %xmm4
437 ; SSE-NEXT: pxor %xmm8, %xmm8
438 ; SSE-NEXT: movdqa %xmm4, %xmm0
439 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
440 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,1,3]
441 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
442 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
443 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0]
444 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,4,7]
445 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
446 ; SSE-NEXT: packuswb %xmm4, %xmm0
447 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3]
448 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255]
449 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
450 ; SSE-NEXT: movdqa %xmm4, %xmm5
451 ; SSE-NEXT: pandn %xmm6, %xmm5
452 ; SSE-NEXT: movdqa %xmm6, %xmm15
453 ; SSE-NEXT: movdqa %xmm1, %xmm6
454 ; SSE-NEXT: movdqa %xmm1, %xmm13
455 ; SSE-NEXT: pand %xmm4, %xmm6
456 ; SSE-NEXT: por %xmm5, %xmm6
457 ; SSE-NEXT: movdqa %xmm6, %xmm5
458 ; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
459 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,0,0,65535,65535]
460 ; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15]
461 ; SSE-NEXT: pand %xmm7, %xmm6
462 ; SSE-NEXT: pandn %xmm5, %xmm7
463 ; SSE-NEXT: por %xmm6, %xmm7
464 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,2,1,3,4,5,6,7]
465 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,7]
466 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
467 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,1,3,4,5,6,7]
468 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,4,6,5,7]
469 ; SSE-NEXT: packuswb %xmm7, %xmm7
470 ; SSE-NEXT: pand %xmm11, %xmm7
471 ; SSE-NEXT: movdqa %xmm11, %xmm5
472 ; SSE-NEXT: pandn %xmm0, %xmm5
473 ; SSE-NEXT: por %xmm5, %xmm7
474 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
475 ; SSE-NEXT: pand %xmm6, %xmm7
476 ; SSE-NEXT: movdqa %xmm9, %xmm1
477 ; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
478 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
479 ; SSE-NEXT: movdqa %xmm1, %xmm0
480 ; SSE-NEXT: movdqa %xmm1, %xmm5
481 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
482 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0]
483 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[2,3]
484 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
485 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
486 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
487 ; SSE-NEXT: packuswb %xmm0, %xmm0
488 ; SSE-NEXT: movdqa %xmm6, %xmm1
489 ; SSE-NEXT: pandn %xmm0, %xmm1
490 ; SSE-NEXT: por %xmm7, %xmm1
491 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
492 ; SSE-NEXT: movdqa %xmm4, %xmm7
493 ; SSE-NEXT: pandn %xmm10, %xmm7
494 ; SSE-NEXT: movdqa %xmm2, %xmm0
495 ; SSE-NEXT: pand %xmm4, %xmm0
496 ; SSE-NEXT: por %xmm7, %xmm0
497 ; SSE-NEXT: movdqa %xmm0, %xmm12
498 ; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
499 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
500 ; SSE-NEXT: movdqa %xmm0, %xmm7
501 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm12[0,0]
502 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm12[2,3]
503 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
504 ; SSE-NEXT: movdqa %xmm14, %xmm12
505 ; SSE-NEXT: movdqa %xmm13, %xmm1
506 ; SSE-NEXT: pandn %xmm13, %xmm12
507 ; SSE-NEXT: movdqa %xmm15, %xmm13
508 ; SSE-NEXT: movdqa %xmm15, %xmm2
509 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
510 ; SSE-NEXT: pand %xmm14, %xmm13
511 ; SSE-NEXT: por %xmm12, %xmm13
512 ; SSE-NEXT: movdqa %xmm13, %xmm12
513 ; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15]
514 ; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
515 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,0,65535,65535,65535,0]
516 ; SSE-NEXT: pand %xmm15, %xmm13
517 ; SSE-NEXT: pandn %xmm12, %xmm15
518 ; SSE-NEXT: por %xmm13, %xmm15
519 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm15[0,2,1,3]
520 ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5]
521 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
522 ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[1,2,3,0,4,5,6,7]
523 ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,4,5,7]
524 ; SSE-NEXT: packuswb %xmm12, %xmm12
525 ; SSE-NEXT: pand %xmm11, %xmm12
526 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3]
527 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,6,7]
528 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,1]
529 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7]
530 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7]
531 ; SSE-NEXT: psllq $48, %xmm0
532 ; SSE-NEXT: packuswb %xmm7, %xmm0
533 ; SSE-NEXT: movdqa %xmm5, %xmm7
534 ; SSE-NEXT: pandn %xmm0, %xmm11
535 ; SSE-NEXT: por %xmm11, %xmm12
536 ; SSE-NEXT: pand %xmm6, %xmm12
537 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm9[3,0]
538 ; SSE-NEXT: movaps %xmm9, %xmm0
539 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0,2]
540 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
541 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
542 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
543 ; SSE-NEXT: packuswb %xmm0, %xmm0
544 ; SSE-NEXT: movdqa %xmm6, %xmm5
545 ; SSE-NEXT: pandn %xmm0, %xmm5
546 ; SSE-NEXT: por %xmm12, %xmm5
547 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
548 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
549 ; SSE-NEXT: movdqa %xmm12, %xmm0
550 ; SSE-NEXT: pandn %xmm1, %xmm0
551 ; SSE-NEXT: movdqa %xmm1, %xmm5
552 ; SSE-NEXT: pand %xmm12, %xmm2
553 ; SSE-NEXT: por %xmm0, %xmm2
554 ; SSE-NEXT: movdqa %xmm2, %xmm0
555 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
556 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,65535,65535,65535]
557 ; SSE-NEXT: movdqa %xmm13, %xmm15
558 ; SSE-NEXT: pandn %xmm0, %xmm15
559 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
560 ; SSE-NEXT: pand %xmm13, %xmm2
561 ; SSE-NEXT: por %xmm15, %xmm2
562 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,2,3,4,5,6,7]
563 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
564 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
565 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
566 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
567 ; SSE-NEXT: packuswb %xmm0, %xmm0
568 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,0,0,65535,65535,65535,65535,65535]
569 ; SSE-NEXT: pandn %xmm0, %xmm15
570 ; SSE-NEXT: movdqa %xmm4, %xmm0
571 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
572 ; SSE-NEXT: pandn %xmm11, %xmm0
573 ; SSE-NEXT: movdqa %xmm11, %xmm7
574 ; SSE-NEXT: pand %xmm14, %xmm7
575 ; SSE-NEXT: pandn %xmm10, %xmm14
576 ; SSE-NEXT: pand %xmm12, %xmm11
577 ; SSE-NEXT: pandn %xmm10, %xmm12
578 ; SSE-NEXT: pand %xmm4, %xmm10
579 ; SSE-NEXT: por %xmm0, %xmm10
580 ; SSE-NEXT: movdqa %xmm10, %xmm0
581 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
582 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
583 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0]
584 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[2,0],xmm0[3,0]
585 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[0,2]
586 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
587 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
588 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
589 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
590 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
591 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
592 ; SSE-NEXT: packuswb %xmm0, %xmm1
593 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
594 ; SSE-NEXT: por %xmm15, %xmm1
595 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1]
596 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
597 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,2,2,3]
598 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
599 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,3,2,3,4,5,6,7]
600 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
601 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
602 ; SSE-NEXT: packuswb %xmm0, %xmm0
603 ; SSE-NEXT: movdqa %xmm6, %xmm10
604 ; SSE-NEXT: pandn %xmm0, %xmm10
605 ; SSE-NEXT: pand %xmm6, %xmm1
606 ; SSE-NEXT: por %xmm1, %xmm10
607 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
608 ; SSE-NEXT: movdqa %xmm15, %xmm0
609 ; SSE-NEXT: pand %xmm3, %xmm0
610 ; SSE-NEXT: pandn %xmm5, %xmm3
611 ; SSE-NEXT: por %xmm0, %xmm3
612 ; SSE-NEXT: movdqa %xmm3, %xmm0
613 ; SSE-NEXT: pxor %xmm1, %xmm1
614 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
615 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
616 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,0]
617 ; SSE-NEXT: por %xmm7, %xmm14
618 ; SSE-NEXT: movdqa %xmm14, %xmm0
619 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
620 ; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm1[8],xmm14[9],xmm1[9],xmm14[10],xmm1[10],xmm14[11],xmm1[11],xmm14[12],xmm1[12],xmm14[13],xmm1[13],xmm14[14],xmm1[14],xmm14[15],xmm1[15]
621 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm0[2,0]
622 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,6,5,6,7]
623 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
624 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7]
625 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
626 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
627 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
628 ; SSE-NEXT: packuswb %xmm1, %xmm0
629 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,65535,65535,65535,65535,65535]
630 ; SSE-NEXT: pand %xmm1, %xmm0
631 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,5,6,5]
632 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[3,1,2,0]
633 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,0,1,2,4,5,6,7]
634 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,4,6,7]
635 ; SSE-NEXT: packuswb %xmm7, %xmm7
636 ; SSE-NEXT: pandn %xmm7, %xmm1
637 ; SSE-NEXT: movaps %xmm9, %xmm7
638 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm2[3,0]
639 ; SSE-NEXT: por %xmm1, %xmm0
640 ; SSE-NEXT: movaps %xmm2, %xmm1
641 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0,2]
642 ; SSE-NEXT: pand %xmm6, %xmm0
643 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
644 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
645 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5]
646 ; SSE-NEXT: packuswb %xmm1, %xmm1
647 ; SSE-NEXT: pandn %xmm1, %xmm6
648 ; SSE-NEXT: por %xmm0, %xmm6
649 ; SSE-NEXT: por %xmm11, %xmm12
650 ; SSE-NEXT: movdqa %xmm12, %xmm1
651 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
652 ; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7]
653 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,1,3]
654 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm1[1,2]
655 ; SSE-NEXT: movdqa %xmm15, %xmm1
656 ; SSE-NEXT: pand %xmm4, %xmm1
657 ; SSE-NEXT: pandn %xmm5, %xmm4
658 ; SSE-NEXT: por %xmm1, %xmm4
659 ; SSE-NEXT: movdqa %xmm4, %xmm1
660 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
661 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
662 ; SSE-NEXT: pand %xmm13, %xmm4
663 ; SSE-NEXT: pandn %xmm1, %xmm13
664 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,2,3,1]
665 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,1,2,3,4,5,6,7]
666 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
667 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
668 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,0,4,5,6,7]
669 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
670 ; SSE-NEXT: packuswb %xmm1, %xmm0
671 ; SSE-NEXT: por %xmm4, %xmm13
672 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,0,0,65535,65535]
673 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[3,1,0,3,4,5,6,7]
674 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,4]
675 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
676 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,0,4,5,6,7]
677 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
678 ; SSE-NEXT: packuswb %xmm4, %xmm4
679 ; SSE-NEXT: pand %xmm3, %xmm4
680 ; SSE-NEXT: pandn %xmm0, %xmm3
681 ; SSE-NEXT: por %xmm3, %xmm4
682 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
683 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,2,3]
684 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
685 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
686 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
687 ; SSE-NEXT: packuswb %xmm1, %xmm2
688 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,1]
689 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
690 ; SSE-NEXT: movaps %xmm0, (%rsi)
691 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
692 ; SSE-NEXT: movaps %xmm0, (%rdx)
693 ; SSE-NEXT: movdqa %xmm10, (%rcx)
694 ; SSE-NEXT: movdqa %xmm6, (%r8)
695 ; SSE-NEXT: movaps %xmm4, (%r9)
696 ; SSE-NEXT: retq
697 ;
698 ; AVX1-ONLY-LABEL: load_i8_stride5_vf16:
699 ; AVX1-ONLY: # %bb.0:
700 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0
701 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
702 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm1
703 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm3
704 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
705 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
706 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
707 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[2,7,12,u,u,u]
708 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm1[u,u,u]
709 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5
710 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u>
711 ; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm4, %xmm5, %xmm4
712 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
713 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm5
714 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4
715 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[1,6,11]
716 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm8, %xmm5
717 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,6,11],zero,zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
718 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm2[0,5,10,15,u,u,u,u,u,u,u,u,u]
719 ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm9, %xmm8
720 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[3,8,13,u,u,u]
721 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm1[u,u,u]
722 ; AVX1-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9
723 ; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm8, %xmm9, %xmm6
724 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm6, %xmm6
725 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12]
726 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm6
727 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm2[1,6,11,u,u,u,u,u,u,u,u,u,u]
728 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[u,u,u,u,u,u],zero,zero,zero,zero,xmm3[4,9,14,u,u,u]
729 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4,5,6,7]
730 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u]
731 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,0,5,10,15],zero,zero,zero,xmm1[u,u,u]
732 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4,5,6,7]
733 ; AVX1-ONLY-NEXT: vpor %xmm8, %xmm9, %xmm8
734 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm8, %xmm8
735 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13]
736 ; AVX1-ONLY-NEXT: vpor %xmm9, %xmm8, %xmm8
737 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm2[2,7,12,u,u,u,u,u,u,u,u,u,u]
738 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm1[u,u,u]
739 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4,5,6,7]
740 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u]
741 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[0,5,10,15,u,u,u]
742 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4,5,6,7]
743 ; AVX1-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9
744 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm7
745 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[4,9,14]
746 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7
747 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
748 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[3,8,13,u,u,u,u,u,u,u,u,u,u]
749 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5],xmm2[6,7]
750 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,2,7,12],zero,zero,zero,xmm1[u,u,u,u]
751 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u]
752 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5],xmm0[6,7]
753 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0
754 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
755 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
756 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rsi)
757 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, (%rdx)
758 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, (%rcx)
759 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, (%r8)
760 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%r9)
761 ; AVX1-ONLY-NEXT: retq
763 ; AVX2-ONLY-LABEL: load_i8_stride5_vf16:
764 ; AVX2-ONLY: # %bb.0:
765 ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm0
766 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1
767 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255>
768 ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
769 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm2, %xmm3
770 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u]
771 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u]
772 ; AVX2-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2
773 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
774 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm3
775 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2
776 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1,6,11]
777 ; AVX2-ONLY-NEXT: vpor %xmm5, %xmm3, %xmm3
778 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255>
779 ; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm5
780 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
781 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm5
782 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
783 ; AVX2-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5
784 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm5
785 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,7,12]
786 ; AVX2-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5
787 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255>
788 ; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6
789 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm6, %xmm7
790 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
791 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u]
792 ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6
793 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm6
794 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3,8,13]
795 ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6
796 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255>
797 ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm7
798 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u]
799 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm7, %xmm7
800 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u]
801 ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7
802 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm4
803 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4,9,14]
804 ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4
805 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u>
806 ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
807 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1
808 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u]
809 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,14],zero,zero,zero,xmm0[2,7,12],zero,zero,zero,xmm0[u,u,u,u]
810 ; AVX2-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0
811 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
812 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
813 ; AVX2-ONLY-NEXT: vmovdqa %xmm3, (%rsi)
814 ; AVX2-ONLY-NEXT: vmovdqa %xmm5, (%rdx)
815 ; AVX2-ONLY-NEXT: vmovdqa %xmm6, (%rcx)
816 ; AVX2-ONLY-NEXT: vmovdqa %xmm4, (%r8)
817 ; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%r9)
818 ; AVX2-ONLY-NEXT: vzeroupper
819 ; AVX2-ONLY-NEXT: retq
821 ; AVX512F-LABEL: load_i8_stride5_vf16:
822 ; AVX512F: # %bb.0:
823 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
824 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm4
825 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm5
826 ; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
827 ; AVX512F-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm0
828 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
829 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12,u,u,u]
830 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[u,u,u]
831 ; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0
832 ; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
833 ; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm2
834 ; AVX512F-NEXT: vmovdqa 64(%rdi), %xmm0
835 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1,6,11]
836 ; AVX512F-NEXT: vpor %xmm6, %xmm2, %xmm6
837 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
838 ; AVX512F-NEXT: vmovdqa %ymm2, %ymm7
839 ; AVX512F-NEXT: vpternlogq $202, %ymm5, %ymm4, %ymm7
840 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u]
841 ; AVX512F-NEXT: vextracti128 $1, %ymm7, %xmm7
842 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[0,5,10,15],zero,zero,zero,xmm7[3,8,13,u,u,u]
843 ; AVX512F-NEXT: vpor %xmm7, %xmm8, %xmm7
844 ; AVX512F-NEXT: vpshufb %xmm3, %xmm7, %xmm7
845 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[2,7,12]
846 ; AVX512F-NEXT: vpor %xmm7, %xmm8, %xmm7
847 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
848 ; AVX512F-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm8
849 ; AVX512F-NEXT: vextracti128 $1, %ymm8, %xmm9
850 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[1,6,11],zero,zero,zero,zero,xmm9[4,9,14,u,u,u]
851 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,7,12],zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[u,u,u]
852 ; AVX512F-NEXT: vpor %xmm9, %xmm8, %xmm8
853 ; AVX512F-NEXT: vpshufb %xmm3, %xmm8, %xmm8
854 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[3,8,13]
855 ; AVX512F-NEXT: vpor %xmm9, %xmm8, %xmm8
856 ; AVX512F-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm1
857 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u]
858 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
859 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15,u,u,u]
860 ; AVX512F-NEXT: vpor %xmm1, %xmm9, %xmm1
861 ; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
862 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[4,9,14]
863 ; AVX512F-NEXT: vpor %xmm3, %xmm1, %xmm1
864 ; AVX512F-NEXT: vpternlogq $202, %ymm4, %ymm5, %ymm2
865 ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3
866 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
867 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
868 ; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2
869 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
870 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
871 ; AVX512F-NEXT: vmovdqa %xmm6, (%rsi)
872 ; AVX512F-NEXT: vmovdqa %xmm7, (%rdx)
873 ; AVX512F-NEXT: vmovdqa %xmm8, (%rcx)
874 ; AVX512F-NEXT: vmovdqa %xmm1, (%r8)
875 ; AVX512F-NEXT: vmovdqa %xmm0, (%r9)
876 ; AVX512F-NEXT: vzeroupper
877 ; AVX512F-NEXT: retq
878 ;
879 ; AVX512BW-LABEL: load_i8_stride5_vf16:
880 ; AVX512BW: # %bb.0:
881 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
882 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
883 ; AVX512BW-NEXT: movw $19026, %ax # imm = 0x4A52
884 ; AVX512BW-NEXT: kmovd %eax, %k1
885 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm2 {%k1}
886 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
887 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u]
888 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[u,u,u]
889 ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
890 ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
891 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
892 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm4
893 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[1,6,11]
894 ; AVX512BW-NEXT: vpor %xmm5, %xmm2, %xmm2
895 ; AVX512BW-NEXT: movw $21140, %ax # imm = 0x5294
896 ; AVX512BW-NEXT: kmovd %eax, %k2
897 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm5 {%k2}
898 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
899 ; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5
900 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
901 ; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5
902 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm5, %xmm5
903 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[2,7,12]
904 ; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5
905 ; AVX512BW-NEXT: movw $10570, %ax # imm = 0x294A
906 ; AVX512BW-NEXT: kmovd %eax, %k3
907 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm6 {%k3}
908 ; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7
909 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
910 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,7,12],zero,zero,zero,xmm6[0,5,10,15],zero,zero,zero,xmm6[u,u,u]
911 ; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6
912 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm6, %xmm6
913 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[3,8,13]
914 ; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6
915 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm7 {%k1}
916 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u]
917 ; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm7
918 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[2,7,12],zero,zero,zero,xmm7[0,5,10,15,u,u,u]
919 ; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7
920 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm7, %xmm3
921 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[4,9,14]
922 ; AVX512BW-NEXT: vpor %xmm7, %xmm3, %xmm3
923 ; AVX512BW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k2}
924 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm0
925 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11,u,u,u,u]
926 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,9,14],zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[u,u,u,u]
927 ; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0
928 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
929 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
930 ; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi)
931 ; AVX512BW-NEXT: vmovdqa %xmm5, (%rdx)
932 ; AVX512BW-NEXT: vmovdqa %xmm6, (%rcx)
933 ; AVX512BW-NEXT: vmovdqa %xmm3, (%r8)
934 ; AVX512BW-NEXT: vmovdqa %xmm0, (%r9)
935 ; AVX512BW-NEXT: vzeroupper
936 ; AVX512BW-NEXT: retq
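; The IR below loads 80 contiguous bytes and splits them at stride 5 into five <16 x i8> results, one per output pointer.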
937 %wide.vec = load <80 x i8>, ptr %in.vec, align 64
938 %strided.vec0 = shufflevector <80 x i8> %wide.vec, <80 x i8> poison, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
939 %strided.vec1 = shufflevector <80 x i8> %wide.vec, <80 x i8> poison, <16 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36, i32 41, i32 46, i32 51, i32 56, i32 61, i32 66, i32 71, i32 76>
940 %strided.vec2 = shufflevector <80 x i8> %wide.vec, <80 x i8> poison, <16 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37, i32 42, i32 47, i32 52, i32 57, i32 62, i32 67, i32 72, i32 77>
941 %strided.vec3 = shufflevector <80 x i8> %wide.vec, <80 x i8> poison, <16 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38, i32 43, i32 48, i32 53, i32 58, i32 63, i32 68, i32 73, i32 78>
942 %strided.vec4 = shufflevector <80 x i8> %wide.vec, <80 x i8> poison, <16 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39, i32 44, i32 49, i32 54, i32 59, i32 64, i32 69, i32 74, i32 79>
943 store <16 x i8> %strided.vec0, ptr %out.vec0, align 64
944 store <16 x i8> %strided.vec1, ptr %out.vec1, align 64
945 store <16 x i8> %strided.vec2, ptr %out.vec2, align 64
946 store <16 x i8> %strided.vec3, ptr %out.vec3, align 64
947 store <16 x i8> %strided.vec4, ptr %out.vec4, align 64
948 ret void
949 }
951 define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
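; Same stride-5 split as the vf16 case above, widened to five <32 x i8> outputs taken from 160 input bytes.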
952 ; SSE-LABEL: load_i8_stride5_vf32:
953 ; SSE: # %bb.0:
954 ; SSE-NEXT: subq $184, %rsp
955 ; SSE-NEXT: movdqa (%rdi), %xmm9
956 ; SSE-NEXT: movdqa 16(%rdi), %xmm3
957 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
958 ; SSE-NEXT: movdqa 32(%rdi), %xmm1
959 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
960 ; SSE-NEXT: movdqa 48(%rdi), %xmm2
961 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
962 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
963 ; SSE-NEXT: movdqa %xmm4, %xmm0
964 ; SSE-NEXT: pandn %xmm1, %xmm0
965 ; SSE-NEXT: movdqa %xmm2, %xmm1
966 ; SSE-NEXT: pand %xmm4, %xmm1
967 ; SSE-NEXT: por %xmm0, %xmm1
968 ; SSE-NEXT: pxor %xmm5, %xmm5
969 ; SSE-NEXT: movdqa %xmm1, %xmm0
970 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
971 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
972 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
973 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
974 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
975 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
976 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
977 ; SSE-NEXT: packuswb %xmm1, %xmm0
978 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,3]
979 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255]
980 ; SSE-NEXT: movdqa %xmm13, %xmm0
981 ; SSE-NEXT: pandn %xmm1, %xmm0
982 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
983 ; SSE-NEXT: movdqa %xmm15, %xmm1
984 ; SSE-NEXT: pandn %xmm3, %xmm1
985 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
986 ; SSE-NEXT: pandn %xmm9, %xmm11
987 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
988 ; SSE-NEXT: movdqa %xmm14, %xmm2
989 ; SSE-NEXT: pandn %xmm9, %xmm2
990 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
991 ; SSE-NEXT: movdqa %xmm4, %xmm2
992 ; SSE-NEXT: pandn %xmm9, %xmm2
993 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
994 ; SSE-NEXT: movdqa %xmm15, %xmm2
995 ; SSE-NEXT: pandn %xmm9, %xmm2
996 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
997 ; SSE-NEXT: pand %xmm15, %xmm9
998 ; SSE-NEXT: por %xmm1, %xmm9
999 ; SSE-NEXT: movdqa %xmm9, %xmm2
1000 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
1001 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,0,0,65535,65535]
1002 ; SSE-NEXT: movdqa %xmm1, %xmm6
1003 ; SSE-NEXT: pandn %xmm2, %xmm6
1004 ; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15]
1005 ; SSE-NEXT: pand %xmm1, %xmm9
1006 ; SSE-NEXT: por %xmm6, %xmm9
1007 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,2,1,3,4,5,6,7]
1008 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
1009 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
1010 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7]
1011 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
1012 ; SSE-NEXT: packuswb %xmm2, %xmm2
1013 ; SSE-NEXT: pand %xmm13, %xmm2
1014 ; SSE-NEXT: por %xmm0, %xmm2
1015 ; SSE-NEXT: movdqa 64(%rdi), %xmm6
1016 ; SSE-NEXT: movdqa %xmm6, %xmm3
1017 ; SSE-NEXT: pxor %xmm0, %xmm0
1018 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1019 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1020 ; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
1021 ; SSE-NEXT: movdqa %xmm6, %xmm0
1022 ; SSE-NEXT: movdqa %xmm6, %xmm8
1023 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1024 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[0,0]
1025 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[2,3]
1026 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
1027 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
1028 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
1029 ; SSE-NEXT: packuswb %xmm0, %xmm0
1030 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
1031 ; SSE-NEXT: movdqa %xmm9, %xmm6
1032 ; SSE-NEXT: pandn %xmm0, %xmm6
1033 ; SSE-NEXT: pand %xmm9, %xmm2
1034 ; SSE-NEXT: por %xmm2, %xmm6
1035 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1036 ; SSE-NEXT: movdqa 112(%rdi), %xmm10
1037 ; SSE-NEXT: movdqa %xmm4, %xmm0
1038 ; SSE-NEXT: pandn %xmm10, %xmm0
1039 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1040 ; SSE-NEXT: movdqa 128(%rdi), %xmm7
1041 ; SSE-NEXT: movdqa %xmm7, %xmm2
1042 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1043 ; SSE-NEXT: pand %xmm4, %xmm2
1044 ; SSE-NEXT: por %xmm0, %xmm2
1045 ; SSE-NEXT: movdqa %xmm2, %xmm0
1046 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
1047 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,1,1,3]
1048 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7]
1049 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
1050 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
1051 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
1052 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
1053 ; SSE-NEXT: packuswb %xmm2, %xmm0
1054 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3]
1055 ; SSE-NEXT: movdqa %xmm13, %xmm2
1056 ; SSE-NEXT: movdqa %xmm13, %xmm3
1057 ; SSE-NEXT: pandn %xmm0, %xmm2
1058 ; SSE-NEXT: movdqa 96(%rdi), %xmm4
1059 ; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill
1060 ; SSE-NEXT: movdqa %xmm15, %xmm0
1061 ; SSE-NEXT: pandn %xmm4, %xmm0
1062 ; SSE-NEXT: movdqa 80(%rdi), %xmm6
1063 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1064 ; SSE-NEXT: pand %xmm15, %xmm6
1065 ; SSE-NEXT: por %xmm0, %xmm6
1066 ; SSE-NEXT: movdqa %xmm6, %xmm0
1067 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
1068 ; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
1069 ; SSE-NEXT: pand %xmm1, %xmm6
1070 ; SSE-NEXT: pandn %xmm0, %xmm1
1071 ; SSE-NEXT: por %xmm6, %xmm1
1072 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,1,3,4,5,6,7]
1073 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
1074 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
1075 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
1076 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
1077 ; SSE-NEXT: packuswb %xmm0, %xmm0
1078 ; SSE-NEXT: pand %xmm13, %xmm0
1079 ; SSE-NEXT: por %xmm2, %xmm0
1080 ; SSE-NEXT: movdqa 144(%rdi), %xmm12
1081 ; SSE-NEXT: movdqa %xmm12, %xmm2
1082 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
1083 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1084 ; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15]
1085 ; SSE-NEXT: movdqa %xmm12, %xmm1
1086 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1087 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0]
1088 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3]
1089 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
1090 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
1091 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4]
1092 ; SSE-NEXT: packuswb %xmm1, %xmm1
1093 ; SSE-NEXT: movdqa %xmm9, %xmm2
1094 ; SSE-NEXT: pandn %xmm1, %xmm2
1095 ; SSE-NEXT: pand %xmm9, %xmm0
1096 ; SSE-NEXT: por %xmm0, %xmm2
1097 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1098 ; SSE-NEXT: movdqa %xmm15, %xmm0
1099 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1100 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1101 ; SSE-NEXT: pand %xmm15, %xmm1
1102 ; SSE-NEXT: por %xmm0, %xmm1
1103 ; SSE-NEXT: movdqa %xmm1, %xmm0
1104 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
1105 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
1106 ; SSE-NEXT: movdqa %xmm1, %xmm2
1107 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
1108 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3]
1109 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,1,3]
1110 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,6,5,6,7]
1111 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
1112 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
1113 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
1114 ; SSE-NEXT: psllq $48, %xmm1
1115 ; SSE-NEXT: packuswb %xmm0, %xmm1
1116 ; SSE-NEXT: movdqa %xmm13, %xmm2
1117 ; SSE-NEXT: pandn %xmm1, %xmm2
1118 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
1119 ; SSE-NEXT: movdqa %xmm4, %xmm1
1120 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
1121 ; SSE-NEXT: pand %xmm13, %xmm1
1122 ; SSE-NEXT: por %xmm11, %xmm1
1123 ; SSE-NEXT: movdqa %xmm1, %xmm6
1124 ; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
1125 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,0,65535,65535,65535,0]
1126 ; SSE-NEXT: movdqa %xmm0, %xmm11
1127 ; SSE-NEXT: pandn %xmm6, %xmm11
1128 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
1129 ; SSE-NEXT: pand %xmm0, %xmm1
1130 ; SSE-NEXT: por %xmm11, %xmm1
1131 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
1132 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
1133 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
1134 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
1135 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,7]
1136 ; SSE-NEXT: packuswb %xmm1, %xmm1
1137 ; SSE-NEXT: pand %xmm3, %xmm1
1138 ; SSE-NEXT: movdqa %xmm3, %xmm11
1139 ; SSE-NEXT: por %xmm2, %xmm1
1140 ; SSE-NEXT: movdqa %xmm8, %xmm2
1141 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1142 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm6[3,0]
1143 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm2[0,2]
1144 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,7,6,7]
1145 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
1146 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
1147 ; SSE-NEXT: packuswb %xmm2, %xmm2
1148 ; SSE-NEXT: movdqa %xmm9, %xmm3
1149 ; SSE-NEXT: pandn %xmm2, %xmm3
1150 ; SSE-NEXT: pand %xmm9, %xmm1
1151 ; SSE-NEXT: por %xmm1, %xmm3
1152 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1153 ; SSE-NEXT: movdqa %xmm15, %xmm2
1154 ; SSE-NEXT: pandn %xmm10, %xmm2
1155 ; SSE-NEXT: movdqa %xmm7, %xmm1
1156 ; SSE-NEXT: pand %xmm15, %xmm1
1157 ; SSE-NEXT: por %xmm2, %xmm1
1158 ; SSE-NEXT: movdqa %xmm1, %xmm2
1159 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
1160 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
1161 ; SSE-NEXT: movdqa %xmm1, %xmm6
1162 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm2[0,0]
1163 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,3]
1164 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2,1,3]
1165 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,7]
1166 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,1]
1167 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
1168 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
1169 ; SSE-NEXT: psllq $48, %xmm1
1170 ; SSE-NEXT: packuswb %xmm2, %xmm1
1171 ; SSE-NEXT: movdqa %xmm13, %xmm2
1172 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
1173 ; SSE-NEXT: pandn %xmm7, %xmm2
1174 ; SSE-NEXT: movdqa (%rsp), %xmm8 # 16-byte Reload
1175 ; SSE-NEXT: movdqa %xmm8, %xmm6
1176 ; SSE-NEXT: pand %xmm13, %xmm6
1177 ; SSE-NEXT: por %xmm2, %xmm6
1178 ; SSE-NEXT: movdqa %xmm6, %xmm2
1179 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
1180 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
1181 ; SSE-NEXT: pand %xmm0, %xmm6
1182 ; SSE-NEXT: pandn %xmm2, %xmm0
1183 ; SSE-NEXT: por %xmm6, %xmm0
1184 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1185 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
1186 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
1187 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
1188 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,7]
1189 ; SSE-NEXT: packuswb %xmm0, %xmm0
1190 ; SSE-NEXT: movdqa %xmm11, %xmm2
1191 ; SSE-NEXT: pand %xmm11, %xmm0
1192 ; SSE-NEXT: pandn %xmm1, %xmm2
1193 ; SSE-NEXT: por %xmm2, %xmm0
1194 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1195 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm2[3,0]
1196 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[0,2]
1197 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,7,6,7]
1198 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
1199 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,5,4]
1200 ; SSE-NEXT: packuswb %xmm1, %xmm1
1201 ; SSE-NEXT: movdqa %xmm9, %xmm2
1202 ; SSE-NEXT: pandn %xmm1, %xmm2
1203 ; SSE-NEXT: pand %xmm9, %xmm0
1204 ; SSE-NEXT: por %xmm0, %xmm2
1205 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1206 ; SSE-NEXT: pand %xmm14, %xmm4
1207 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
1208 ; SSE-NEXT: movdqa %xmm4, %xmm2
1209 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
1210 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,0,65535,65535,65535]
1211 ; SSE-NEXT: movdqa %xmm3, %xmm6
1212 ; SSE-NEXT: pandn %xmm2, %xmm6
1213 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
1214 ; SSE-NEXT: pand %xmm3, %xmm4
1215 ; SSE-NEXT: por %xmm6, %xmm4
1216 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[2,1,2,3,4,5,6,7]
1217 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
1218 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1219 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
1220 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
1221 ; SSE-NEXT: packuswb %xmm0, %xmm0
1222 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,65535,65535,65535,65535,65535]
1223 ; SSE-NEXT: movdqa %xmm1, %xmm2
1224 ; SSE-NEXT: movdqa %xmm1, %xmm10
1225 ; SSE-NEXT: pandn %xmm0, %xmm2
1226 ; SSE-NEXT: movdqa %xmm15, %xmm0
1227 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1228 ; SSE-NEXT: pandn %xmm1, %xmm0
1229 ; SSE-NEXT: movdqa %xmm13, %xmm6
1230 ; SSE-NEXT: movdqa %xmm13, %xmm12
1231 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
1232 ; SSE-NEXT: pandn %xmm11, %xmm6
1233 ; SSE-NEXT: movdqa %xmm14, %xmm4
1234 ; SSE-NEXT: pandn %xmm11, %xmm4
1235 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1236 ; SSE-NEXT: pand %xmm15, %xmm11
1237 ; SSE-NEXT: movdqa %xmm15, %xmm4
1238 ; SSE-NEXT: por %xmm0, %xmm11
1239 ; SSE-NEXT: movdqa %xmm11, %xmm0
1240 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
1241 ; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7]
1242 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,1,2,0]
1243 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[3,0]
1244 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm11[0,2]
1245 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
1246 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
1247 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1248 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
1249 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
1250 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm13[0,1,2,3,4,5,6,5]
1251 ; SSE-NEXT: packuswb %xmm0, %xmm11
1252 ; SSE-NEXT: pand %xmm10, %xmm11
1253 ; SSE-NEXT: por %xmm2, %xmm11
1254 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1255 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
1256 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1257 ; SSE-NEXT: # xmm2 = mem[0,2,2,3]
1258 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1259 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7]
1260 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
1261 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
1262 ; SSE-NEXT: packuswb %xmm0, %xmm0
1263 ; SSE-NEXT: movdqa %xmm9, %xmm2
1264 ; SSE-NEXT: pandn %xmm0, %xmm2
1265 ; SSE-NEXT: pand %xmm9, %xmm11
1266 ; SSE-NEXT: por %xmm11, %xmm2
1267 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1268 ; SSE-NEXT: movdqa %xmm14, %xmm0
1269 ; SSE-NEXT: pandn %xmm7, %xmm0
1270 ; SSE-NEXT: movdqa %xmm8, %xmm15
1271 ; SSE-NEXT: movdqa %xmm8, %xmm2
1272 ; SSE-NEXT: pand %xmm14, %xmm2
1273 ; SSE-NEXT: por %xmm0, %xmm2
1274 ; SSE-NEXT: movdqa %xmm2, %xmm0
1275 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
1276 ; SSE-NEXT: movdqa %xmm3, %xmm11
1277 ; SSE-NEXT: pandn %xmm0, %xmm11
1278 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
1279 ; SSE-NEXT: pand %xmm3, %xmm2
1280 ; SSE-NEXT: por %xmm11, %xmm2
1281 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,2,3,4,5,6,7]
1282 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
1283 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
1284 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
1285 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
1286 ; SSE-NEXT: packuswb %xmm0, %xmm0
1287 ; SSE-NEXT: movdqa %xmm10, %xmm13
1288 ; SSE-NEXT: pandn %xmm0, %xmm13
1289 ; SSE-NEXT: movdqa %xmm4, %xmm11
1290 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1291 ; SSE-NEXT: pandn %xmm2, %xmm11
1292 ; SSE-NEXT: movdqa %xmm1, %xmm5
1293 ; SSE-NEXT: movdqa %xmm1, %xmm0
1294 ; SSE-NEXT: movdqa %xmm12, %xmm1
1295 ; SSE-NEXT: pand %xmm12, %xmm0
1296 ; SSE-NEXT: movdqa %xmm2, %xmm7
1297 ; SSE-NEXT: pand %xmm12, %xmm7
1298 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
1299 ; SSE-NEXT: pandn %xmm8, %xmm1
1300 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1301 ; SSE-NEXT: pand %xmm14, %xmm5
1302 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1303 ; SSE-NEXT: pand %xmm14, %xmm2
1304 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1305 ; SSE-NEXT: pandn %xmm8, %xmm14
1306 ; SSE-NEXT: pand %xmm4, %xmm8
1307 ; SSE-NEXT: por %xmm11, %xmm8
1308 ; SSE-NEXT: movdqa %xmm8, %xmm11
1309 ; SSE-NEXT: pxor %xmm1, %xmm1
1310 ; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm1[8],xmm11[9],xmm1[9],xmm11[10],xmm1[10],xmm11[11],xmm1[11],xmm11[12],xmm1[12],xmm11[13],xmm1[13],xmm11[14],xmm1[14],xmm11[15],xmm1[15]
1311 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
1312 ; SSE-NEXT: pxor %xmm2, %xmm2
1313 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,2,0]
1314 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm11[3,0]
1315 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm8[0,2]
1316 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[2,1,2,3,4,5,6,7]
1317 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7]
1318 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
1319 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,3,2,1,4,5,6,7]
1320 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6]
1321 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
1322 ; SSE-NEXT: packuswb %xmm8, %xmm1
1323 ; SSE-NEXT: pand %xmm10, %xmm1
1324 ; SSE-NEXT: por %xmm13, %xmm1
1325 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
1326 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,1,1]
1327 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1328 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,2,2,3]
1329 ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
1330 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm11[0,3,2,3,4,5,6,7]
1331 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0]
1332 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,5]
1333 ; SSE-NEXT: packuswb %xmm8, %xmm11
1334 ; SSE-NEXT: movdqa %xmm9, %xmm12
1335 ; SSE-NEXT: pandn %xmm11, %xmm12
1336 ; SSE-NEXT: pand %xmm9, %xmm1
1337 ; SSE-NEXT: por %xmm1, %xmm12
1338 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1339 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
1340 ; SSE-NEXT: pand %xmm13, %xmm1
1341 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1342 ; SSE-NEXT: movdqa %xmm1, %xmm11
1343 ; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15]
1344 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1345 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm11[2,0]
1346 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
1347 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
1348 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,0,1,2,4,5,6,7]
1349 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,6,7]
1350 ; SSE-NEXT: packuswb %xmm1, %xmm1
1351 ; SSE-NEXT: movdqa %xmm10, %xmm11
1352 ; SSE-NEXT: pandn %xmm1, %xmm11
1353 ; SSE-NEXT: por %xmm6, %xmm0
1354 ; SSE-NEXT: movdqa %xmm0, %xmm1
1355 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1356 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1357 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0]
1358 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
1359 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
1360 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7]
1361 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
1362 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
1363 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6]
1364 ; SSE-NEXT: packuswb %xmm0, %xmm1
1365 ; SSE-NEXT: pand %xmm10, %xmm1
1366 ; SSE-NEXT: por %xmm11, %xmm1
1367 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1368 ; SSE-NEXT: movaps %xmm10, %xmm0
1369 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
1370 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[3,0]
1371 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2]
1372 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,6,6,7]
1373 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
1374 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,5]
1375 ; SSE-NEXT: packuswb %xmm0, %xmm6
1376 ; SSE-NEXT: movdqa %xmm9, %xmm8
1377 ; SSE-NEXT: pandn %xmm6, %xmm8
1378 ; SSE-NEXT: pand %xmm9, %xmm1
1379 ; SSE-NEXT: por %xmm1, %xmm8
1380 ; SSE-NEXT: movdqa %xmm13, %xmm0
1381 ; SSE-NEXT: pand %xmm13, %xmm15
1382 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1383 ; SSE-NEXT: por %xmm15, %xmm0
1384 ; SSE-NEXT: movdqa %xmm0, %xmm1
1385 ; SSE-NEXT: pxor %xmm6, %xmm6
1386 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15]
1387 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
1388 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0]
1389 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1390 ; SSE-NEXT: por %xmm7, %xmm2
1391 ; SSE-NEXT: movdqa %xmm2, %xmm1
1392 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
1393 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm6[8],xmm2[9],xmm6[9],xmm2[10],xmm6[10],xmm2[11],xmm6[11],xmm2[12],xmm6[12],xmm2[13],xmm6[13],xmm2[14],xmm6[14],xmm2[15],xmm6[15]
1394 ; SSE-NEXT: pxor %xmm13, %xmm13
1395 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,0]
1396 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
1397 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
1398 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
1399 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
1400 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
1401 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6]
1402 ; SSE-NEXT: packuswb %xmm2, %xmm1
1403 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,65535,65535,65535,65535,65535]
1404 ; SSE-NEXT: pand %xmm2, %xmm1
1405 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,6,5]
1406 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[3,1,2,0]
1407 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,0,1,2,4,5,6,7]
1408 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,4,6,7]
1409 ; SSE-NEXT: packuswb %xmm6, %xmm6
1410 ; SSE-NEXT: pandn %xmm6, %xmm2
1411 ; SSE-NEXT: por %xmm2, %xmm1
1412 ; SSE-NEXT: movdqa %xmm4, %xmm2
1413 ; SSE-NEXT: movdqa %xmm4, %xmm15
1414 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[3,0]
1415 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2]
1416 ; SSE-NEXT: pand %xmm9, %xmm1
1417 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7]
1418 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
1419 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,5]
1420 ; SSE-NEXT: packuswb %xmm2, %xmm2
1421 ; SSE-NEXT: pandn %xmm2, %xmm9
1422 ; SSE-NEXT: por %xmm1, %xmm9
1423 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1424 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1425 ; SSE-NEXT: movdqa %xmm0, %xmm1
1426 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15]
1427 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
1428 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
1429 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
1430 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
1431 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
1432 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
1433 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1434 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,3,0,4,5,6,7]
1435 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
1436 ; SSE-NEXT: packuswb %xmm1, %xmm2
1437 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535]
1438 ; SSE-NEXT: movdqa %xmm4, %xmm6
1439 ; SSE-NEXT: pandn %xmm2, %xmm6
1440 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1441 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
1442 ; SSE-NEXT: pand %xmm5, %xmm2
1443 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1444 ; SSE-NEXT: movdqa %xmm2, %xmm0
1445 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3],xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
1446 ; SSE-NEXT: movdqa %xmm3, %xmm11
1447 ; SSE-NEXT: pandn %xmm0, %xmm11
1448 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15]
1449 ; SSE-NEXT: pand %xmm3, %xmm2
1450 ; SSE-NEXT: por %xmm11, %xmm2
1451 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,0,3,4,5,6,7]
1452 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4]
1453 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
1454 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,0,4,5,6,7]
1455 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
1456 ; SSE-NEXT: packuswb %xmm2, %xmm2
1457 ; SSE-NEXT: pand %xmm4, %xmm2
1458 ; SSE-NEXT: por %xmm6, %xmm2
1459 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
1460 ; SSE-NEXT: # xmm6 = mem[3,1,2,3]
1461 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3]
1462 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,2,1,4,5,6,7]
1463 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,0,3,4,5,6,7]
1464 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1]
1465 ; SSE-NEXT: packuswb %xmm1, %xmm10
1466 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,1]
1467 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
1468 ; SSE-NEXT: movdqa %xmm14, %xmm1
1469 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15]
1470 ; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
1471 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,1,1,3]
1472 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm1[1,2]
1473 ; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
1474 ; SSE-NEXT: pand %xmm5, %xmm0
1475 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
1476 ; SSE-NEXT: por %xmm0, %xmm5
1477 ; SSE-NEXT: movdqa %xmm5, %xmm1
1478 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3],xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7]
1479 ; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15]
1480 ; SSE-NEXT: pand %xmm3, %xmm5
1481 ; SSE-NEXT: pandn %xmm1, %xmm3
1482 ; SSE-NEXT: por %xmm5, %xmm3
1483 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[3,1,0,3,4,5,6,7]
1484 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4]
1485 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
1486 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7]
1487 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
1488 ; SSE-NEXT: packuswb %xmm1, %xmm1
1489 ; SSE-NEXT: pand %xmm4, %xmm1
1490 ; SSE-NEXT: movdqa %xmm4, %xmm7
1491 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2,3,1]
1492 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[3,1,2,3,4,5,6,7]
1493 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
1494 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
1495 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,3,0,4,5,6,7]
1496 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5,4,7]
1497 ; SSE-NEXT: packuswb %xmm3, %xmm4
1498 ; SSE-NEXT: pandn %xmm4, %xmm7
1499 ; SSE-NEXT: por %xmm7, %xmm1
1500 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
1501 ; SSE-NEXT: # xmm4 = mem[3,1,2,3]
1502 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,2,2,3]
1503 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7]
1504 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
1505 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
1506 ; SSE-NEXT: packuswb %xmm3, %xmm5
1507 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,1]
1508 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1509 ; SSE-NEXT: movaps %xmm0, 16(%rsi)
1510 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1511 ; SSE-NEXT: movaps %xmm0, (%rsi)
1512 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1513 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
1514 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1515 ; SSE-NEXT: movaps %xmm0, (%rdx)
1516 ; SSE-NEXT: movdqa %xmm12, 16(%rcx)
1517 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1518 ; SSE-NEXT: movaps %xmm0, (%rcx)
1519 ; SSE-NEXT: movdqa %xmm9, 16(%r8)
1520 ; SSE-NEXT: movdqa %xmm8, (%r8)
1521 ; SSE-NEXT: movaps %xmm1, 16(%r9)
1522 ; SSE-NEXT: movaps %xmm2, (%r9)
1523 ; SSE-NEXT: addq $184, %rsp
1524 ; SSE-NEXT: retq
1525 ;
1526 ; AVX1-ONLY-LABEL: load_i8_stride5_vf32:
1527 ; AVX1-ONLY: # %bb.0:
1528 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm0
1529 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[1,6,11]
1530 ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm1
1531 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
1532 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm5
1533 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2
1534 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u],zero,zero,zero,xmm2[3,8,13,u,u,u,u,u,u]
1535 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm3
1536 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u,u,4,9,14],zero,zero,zero,xmm3[u,u,u,u,u,u]
1537 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4
1538 ; AVX1-ONLY-NEXT: vpxor %xmm6, %xmm6, %xmm6
1539 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3,4,5,6,7]
1540 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm4
1541 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[0,5,10,15],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u]
1542 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6
1543 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7]
1544 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm7
1545 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm9
1546 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8
1547 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm10
1548 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
1549 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
1550 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm5[2],xmm11[3],xmm5[3]
1551 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12,u,u,u]
1552 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm8[u,u,u]
1553 ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11
1554 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u>
1555 ; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm5, %xmm11, %xmm5
1556 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
1557 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm11
1558 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5
1559 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11]
1560 ; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm12, %ymm14
1561 ; AVX1-ONLY-NEXT: vorps %ymm14, %ymm11, %ymm11
1562 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm11, %ymm6
1563 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1564 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[2,7,12]
1565 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
1566 ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm14, %xmm11
1567 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u]
1568 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm9[0,5,10,15,u,u,u,u,u,u,u,u,u]
1569 ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm15, %xmm14
1570 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13,u,u,u]
1571 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u,u,u,u,u,4,9,14],zero,zero,zero,xmm8[u,u,u]
1572 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm15, %xmm6
1573 ; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm14, %xmm6, %xmm6
1574 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14,u,u,u,u,u,u]
1575 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,0,5,10,15],zero,zero,zero,xmm3[u,u,u,u,u,u]
1576 ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13
1577 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u>
1578 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13
1579 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u]
1580 ; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13
1581 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2,3,4],xmm11[5,6,7]
1582 ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm12, %ymm6
1583 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12]
1584 ; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm12, %ymm13
1585 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm13, %ymm6
1586 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm6, %ymm6
1587 ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1588 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13]
1589 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
1590 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm13, %xmm6
1591 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,1,6,11],zero,zero,zero,zero,xmm3[u,u,u,u,u,u]
1592 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm2[u,u,u],zero,zero,zero,xmm2[0,5,10,15,u,u,u,u,u,u]
1593 ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm13
1594 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13
1595 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,7,12],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u]
1596 ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13
1597 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2,3,4],xmm6[5,6,7]
1598 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm9[1,6,11,u,u,u,u,u,u,u,u,u,u]
1599 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u]
1600 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5,6,7]
1601 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
1602 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,0,5,10,15],zero,zero,zero,xmm8[u,u,u]
1603 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7]
1604 ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13
1605 ; AVX1-ONLY-NEXT: vandps %ymm12, %ymm13, %ymm13
1606 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13]
1607 ; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm12, %ymm12
1608 ; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm12
1609 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm12
1610 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm9[2,7,12,u,u,u,u,u,u,u,u,u,u]
1611 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm8[u,u,u]
1612 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm13[3,4,5,6,7]
1613 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
1614 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm10[u,u,u,u,u,u],zero,zero,zero,xmm10[0,5,10,15,u,u,u]
1615 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5,6,7]
1616 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm13, %xmm6
1617 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,xmm2[1,6,11,u,u,u,u,u,u,u]
1618 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,2,7,12],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
1619 ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13
1620 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u>
1621 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13
1622 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[3,8,13],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u,u]
1623 ; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13
1624 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14]
1625 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm13
1626 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255]
1627 ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm15, %ymm6
1628 ; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm15, %ymm13
1629 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm13, %ymm6
1630 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm0[4,9,14]
1631 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
1632 ; AVX1-ONLY-NEXT: vpor %xmm13, %xmm15, %xmm13
1633 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm15
1634 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
1635 ; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm15, %xmm13, %xmm13
1636 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6
1637 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u],zero,zero,zero,xmm10[1,6,11,u,u,u,u]
1638 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[3,8,13,u,u,u,u,u,u,u,u,u,u]
1639 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4,5],xmm9[6,7]
1640 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,2,7,12],zero,zero,zero,xmm8[u,u,u,u]
1641 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
1642 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4,5],xmm7[6,7]
1643 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7
1644 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[2,7,12,u,u,u,u,u,u,u]
1645 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,3,8,13],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
1646 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
1647 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm2
1648 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[4,9,14],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u,u]
1649 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2
1650 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,1,6,11],zero,zero,zero,zero
1651 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[0,5,10,15]
1652 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0
1653 ; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm2, %xmm0, %xmm0
1654 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15]
1655 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
1656 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7]
1657 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
1658 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1659 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi)
1660 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1661 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx)
1662 ; AVX1-ONLY-NEXT: vmovaps %ymm12, (%rcx)
1663 ; AVX1-ONLY-NEXT: vmovaps %ymm6, (%r8)
1664 ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9)
1665 ; AVX1-ONLY-NEXT: vzeroupper
1666 ; AVX1-ONLY-NEXT: retq
1667 ;
1668 ; AVX2-ONLY-LABEL: load_i8_stride5_vf32:
1669 ; AVX2-ONLY: # %bb.0:
1670 ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm0
1671 ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm4
1672 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm5
1673 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm1
1674 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm2
1675 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255>
1676 ; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm6
1677 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm6, %xmm7
1678 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm7[4,9,14],zero,zero,zero,xmm7[2,7,12,u,u,u]
1679 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,5,10,15],zero,zero,zero,xmm6[3,8,13],zero,zero,zero,xmm6[u,u,u]
1680 ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6
1681 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255>
1682 ; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm7
1683 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
1684 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255]
1685 ; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,0,1]
1686 ; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7
1687 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
1688 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
1689 ; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm6, %ymm7, %ymm6
1690 ; AVX2-ONLY-NEXT: vmovdqa 144(%rdi), %xmm7
1691 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[1,6,11]
1692 ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm8
1693 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
1694 ; AVX2-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11
1695 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
1696 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0,1,2,3,4],ymm11[5,6,7],ymm6[8,9,10,11,12],ymm11[13,14,15]
1697 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7]
1698 ; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm9
1699 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[1,6,11],zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[u,u,u]
1700 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm9
1701 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[3,8,13,u,u,u]
1702 ; AVX2-ONLY-NEXT: vpor %xmm11, %xmm9, %xmm9
1703 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255>
1704 ; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm12
1705 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
1706 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0]
1707 ; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,0,1]
1708 ; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12
1709 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
1710 ; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm9, %ymm12, %ymm9
1711 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[2,7,12]
1712 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
1713 ; AVX2-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12
1714 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
1715 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0,1,2,3,4],ymm12[5,6,7],ymm9[8,9,10,11,12],ymm12[13,14,15]
1716 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7]
1717 ; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm5, %ymm4, %ymm11
1718 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm11, %xmm12
1719 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14,u,u,u]
1720 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero,xmm11[u,u,u]
1721 ; AVX2-ONLY-NEXT: vpor %xmm12, %xmm11, %xmm11
1722 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255>
1723 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm13
1724 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
1725 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255]
1726 ; AVX2-ONLY-NEXT: # ymm15 = mem[0,1,0,1]
1727 ; AVX2-ONLY-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm13
1728 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
1729 ; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm11, %ymm13, %ymm10
1730 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[3,8,13]
1731 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
1732 ; AVX2-ONLY-NEXT: vpor %xmm11, %xmm13, %xmm11
1733 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
1734 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15]
1735 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
1736 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm5, %ymm4, %ymm11
1737 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero,xmm11[u,u,u]
1738 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm11, %xmm11
1739 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[2,7,12],zero,zero,zero,xmm11[0,5,10,15,u,u,u]
1740 ; AVX2-ONLY-NEXT: vpor %xmm12, %xmm11, %xmm11
1741 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u>
1742 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm13
1743 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
1744 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0]
1745 ; AVX2-ONLY-NEXT: # ymm15 = mem[0,1,0,1]
1746 ; AVX2-ONLY-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm13
1747 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
1748 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u>
1749 ; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm11, %ymm13, %ymm11
1750 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm7[4,9,14]
1751 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
1752 ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7
1753 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
1754 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
1755 ; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm11, %ymm7, %ymm7
1756 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm5, %ymm4, %ymm4
1757 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm4, %xmm5
1758 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[1,6,11,u,u,u,u]
1759 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,9,14],zero,zero,zero,xmm4[2,7,12],zero,zero,zero,xmm4[u,u,u,u]
1760 ; AVX2-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4
1761 ; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
1762 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
1763 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255]
1764 ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1]
1765 ; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
1766 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
1767 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7]
1768 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
1769 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5]
1770 ; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm2, %ymm0
1771 ; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0
1772 ; AVX2-ONLY-NEXT: vmovdqa %ymm6, (%rsi)
1773 ; AVX2-ONLY-NEXT: vmovdqa %ymm9, (%rdx)
1774 ; AVX2-ONLY-NEXT: vmovdqa %ymm10, (%rcx)
1775 ; AVX2-ONLY-NEXT: vmovdqa %ymm7, (%r8)
1776 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%r9)
1777 ; AVX2-ONLY-NEXT: vzeroupper
1778 ; AVX2-ONLY-NEXT: retq
1779 ;
1780 ; AVX512F-LABEL: load_i8_stride5_vf32:
1781 ; AVX512F: # %bb.0:
1782 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
1783 ; AVX512F-NEXT: vmovdqa (%rdi), %ymm3
1784 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm4
1785 ; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm1
1786 ; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm2
1787 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm5
1788 ; AVX512F-NEXT: vpternlogq $202, %ymm4, %ymm3, %ymm5
1789 ; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
1790 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
1791 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
1792 ; AVX512F-NEXT: vpor %xmm6, %xmm5, %xmm6
1793 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
1794 ; AVX512F-NEXT: vmovdqa %ymm5, %ymm7
1795 ; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm7
1796 ; AVX512F-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
1797 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm8
1798 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,6,11,16,21,26,31,20,25,30,19,24,29],zero,zero,zero,zero,zero,zero
1799 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
1800 ; AVX512F-NEXT: vpternlogq $248, %ymm10, %ymm6, %ymm9
1801 ; AVX512F-NEXT: vmovdqa 144(%rdi), %xmm7
1802 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = xmm7[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[1,6,11]
1803 ; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm8
1804 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
1805 ; AVX512F-NEXT: vpor %xmm6, %xmm11, %xmm6
1806 ; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
1807 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6,7],ymm9[8,9,10,11,12],ymm6[13,14,15]
1808 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
1809 ; AVX512F-NEXT: vmovdqa %ymm5, %ymm9
1810 ; AVX512F-NEXT: vpternlogq $202, %ymm4, %ymm3, %ymm9
1811 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm9[1,6,11],zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[u,u,u]
1812 ; AVX512F-NEXT: vextracti128 $1, %ymm9, %xmm9
1813 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[3,8,13,u,u,u]
1814 ; AVX512F-NEXT: vpor %xmm11, %xmm9, %xmm9
1815 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
1816 ; AVX512F-NEXT: vmovdqa %ymm11, %ymm12
1817 ; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm12
1818 ; AVX512F-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
1819 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm13
1820 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[2,7,12,17,22,27,16,21,26,31,20,25,30],zero,zero,zero,zero,zero,zero
1821 ; AVX512F-NEXT: vpternlogq $248, %ymm10, %ymm9, %ymm12
1822 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[2,7,12]
1823 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
1824 ; AVX512F-NEXT: vpor %xmm9, %xmm13, %xmm9
1825 ; AVX512F-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
1826 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5,6,7],ymm12[8,9,10,11,12],ymm9[13,14,15]
1827 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
1828 ; AVX512F-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm11
1829 ; AVX512F-NEXT: vextracti128 $1, %ymm11, %xmm12
1830 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14,u,u,u]
1831 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero,xmm11[u,u,u]
1832 ; AVX512F-NEXT: vpor %xmm12, %xmm11, %xmm11
1833 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm12
1834 ; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm12
1835 ; AVX512F-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
1836 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm13
1837 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[3,8,13,18,23,28,17,22,27,16,21,26,31],zero,zero,zero,zero,zero,zero
1838 ; AVX512F-NEXT: vpternlogq $248, %ymm10, %ymm11, %ymm12
1839 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm7[3,8,13]
1840 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
1841 ; AVX512F-NEXT: vpor %xmm10, %xmm11, %xmm10
1842 ; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
1843 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5,6,7],ymm12[8,9,10,11,12],ymm10[13,14,15]
1844 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
1845 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm11
1846 ; AVX512F-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm11
1847 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero,xmm11[u,u,u]
1848 ; AVX512F-NEXT: vextracti128 $1, %ymm11, %xmm11
1849 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[2,7,12],zero,zero,zero,xmm11[0,5,10,15,u,u,u]
1850 ; AVX512F-NEXT: vpor %xmm12, %xmm11, %xmm11
1851 ; AVX512F-NEXT: vmovdqa %ymm5, %ymm12
1852 ; AVX512F-NEXT: vpternlogq $202, %ymm1, %ymm2, %ymm12
1853 ; AVX512F-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
1854 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm12, %ymm13
1855 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
1856 ; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm12
1857 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm7[4,9,14]
1858 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
1859 ; AVX512F-NEXT: vpor %xmm7, %xmm8, %xmm7
1860 ; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
1861 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
1862 ; AVX512F-NEXT: vpternlogq $184, %ymm12, %ymm8, %ymm7
1863 ; AVX512F-NEXT: vpternlogq $202, %ymm3, %ymm4, %ymm5
1864 ; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm3
1865 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
1866 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u]
1867 ; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3
1868 ; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0
1869 ; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
1870 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
1871 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
1872 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
1873 ; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm1
1874 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
1875 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5]
1876 ; AVX512F-NEXT: vpermd %ymm1, %ymm2, %ymm1
1877 ; AVX512F-NEXT: vpternlogq $184, %ymm0, %ymm8, %ymm1
1878 ; AVX512F-NEXT: vmovdqa %ymm6, (%rsi)
1879 ; AVX512F-NEXT: vmovdqa %ymm9, (%rdx)
1880 ; AVX512F-NEXT: vmovdqa %ymm10, (%rcx)
1881 ; AVX512F-NEXT: vmovdqa %ymm7, (%r8)
1882 ; AVX512F-NEXT: vmovdqa %ymm1, (%r9)
1883 ; AVX512F-NEXT: vzeroupper
1884 ; AVX512F-NEXT: retq
1885 ;
1886 ; AVX512BW-LABEL: load_i8_stride5_vf32:
1887 ; AVX512BW: # %bb.0:
1888 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm3
1889 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
1890 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm0
1891 ; AVX512BW-NEXT: vmovdqa 96(%rdi), %ymm1
1892 ; AVX512BW-NEXT: movw $21140, %ax # imm = 0x5294
1893 ; AVX512BW-NEXT: kmovd %eax, %k1
1894 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k1}
1895 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
1896 ; AVX512BW-NEXT: movl $1108344832, %eax # imm = 0x42100000
1897 ; AVX512BW-NEXT: kmovd %eax, %k2
1898 ; AVX512BW-NEXT: vmovdqu8 %ymm5, %ymm4 {%k2}
1899 ; AVX512BW-NEXT: movw $19026, %ax # imm = 0x4A52
1900 ; AVX512BW-NEXT: kmovd %eax, %k2
1901 ; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k2}
1902 ; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6
1903 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
1904 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
1905 ; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm5
1906 ; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000
1907 ; AVX512BW-NEXT: kmovd %eax, %k3
1908 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 {%k3} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
1909 ; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm6
1910 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11]
1911 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm7
1912 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
1913 ; AVX512BW-NEXT: vpor %xmm4, %xmm8, %xmm4
1914 ; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
1915 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7],ymm5[8,9,10,11,12],ymm4[13,14,15]
1916 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1917 ; AVX512BW-NEXT: movw $10570, %ax # imm = 0x294A
1918 ; AVX512BW-NEXT: kmovd %eax, %k4
1919 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm5 {%k4}
1920 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1]
1921 ; AVX512BW-NEXT: movl $-2078212096, %eax # imm = 0x84210000
1922 ; AVX512BW-NEXT: kmovd %eax, %k5
1923 ; AVX512BW-NEXT: vmovdqu8 %ymm8, %ymm5 {%k5}
1924 ; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm8 {%k1}
1925 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[1,6,11],zero,zero,zero,zero,xmm8[4,9,14],zero,zero,zero,xmm8[u,u,u]
1926 ; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm8
1927 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13,u,u,u]
1928 ; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8
1929 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 {%k3} = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
1930 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[2,7,12]
1931 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
1932 ; AVX512BW-NEXT: vpor %xmm5, %xmm9, %xmm5
1933 ; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
1934 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7],ymm8[8,9,10,11,12],ymm5[13,14,15]
1935 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
1936 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm8 {%k2}
1937 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
1938 ; AVX512BW-NEXT: movl $138543104, %eax # imm = 0x8420000
1939 ; AVX512BW-NEXT: kmovd %eax, %k5
1940 ; AVX512BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k5}
1941 ; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k4}
1942 ; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm10
1943 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u]
1944 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u]
1945 ; AVX512BW-NEXT: vpor %xmm10, %xmm9, %xmm9
1946 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 {%k3} = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
1947 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[3,8,13]
1948 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
1949 ; AVX512BW-NEXT: vpor %xmm8, %xmm10, %xmm8
1950 ; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
1951 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15]
1952 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1953 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm9 {%k1}
1954 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
1955 ; AVX512BW-NEXT: movl $277086208, %eax # imm = 0x10840000
1956 ; AVX512BW-NEXT: kmovd %eax, %k3
1957 ; AVX512BW-NEXT: vmovdqu8 %ymm10, %ymm9 {%k3}
1958 ; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm10 {%k2}
1959 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[3,8,13],zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u]
1960 ; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm10
1961 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[2,7,12],zero,zero,zero,xmm10[0,5,10,15,u,u,u]
1962 ; AVX512BW-NEXT: vpor %xmm11, %xmm10, %xmm10
1963 ; AVX512BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000
1964 ; AVX512BW-NEXT: kmovd %eax, %k3
1965 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 {%k3} = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
1966 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14]
1967 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
1968 ; AVX512BW-NEXT: vpor %xmm6, %xmm7, %xmm6
1969 ; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
1970 ; AVX512BW-NEXT: movl $-33554432, %eax # imm = 0xFE000000
1971 ; AVX512BW-NEXT: kmovd %eax, %k3
1972 ; AVX512BW-NEXT: vmovdqu8 %ymm6, %ymm10 {%k3}
1973 ; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k1}
1974 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
1975 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
1976 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
1977 ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
1978 ; AVX512BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k2}
1979 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
1980 ; AVX512BW-NEXT: movl $554172416, %eax # imm = 0x21080000
1981 ; AVX512BW-NEXT: kmovd %eax, %k1
1982 ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
1983 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
1984 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
1985 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm1
1986 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
1987 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,5,0,5,0,5,0,5]
1988 ; AVX512BW-NEXT: vpermd %ymm1, %ymm2, %ymm1
1989 ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k3}
1990 ; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi)
1991 ; AVX512BW-NEXT: vmovdqa %ymm5, (%rdx)
1992 ; AVX512BW-NEXT: vmovdqa %ymm8, (%rcx)
1993 ; AVX512BW-NEXT: vmovdqa %ymm10, (%r8)
1994 ; AVX512BW-NEXT: vmovdqa %ymm0, (%r9)
1995 ; AVX512BW-NEXT: vzeroupper
1996 ; AVX512BW-NEXT: retq
1997 %wide.vec = load <160 x i8>, ptr %in.vec, align 64
1998 %strided.vec0 = shufflevector <160 x i8> %wide.vec, <160 x i8> poison, <32 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75, i32 80, i32 85, i32 90, i32 95, i32 100, i32 105, i32 110, i32 115, i32 120, i32 125, i32 130, i32 135, i32 140, i32 145, i32 150, i32 155>
1999 %strided.vec1 = shufflevector <160 x i8> %wide.vec, <160 x i8> poison, <32 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36, i32 41, i32 46, i32 51, i32 56, i32 61, i32 66, i32 71, i32 76, i32 81, i32 86, i32 91, i32 96, i32 101, i32 106, i32 111, i32 116, i32 121, i32 126, i32 131, i32 136, i32 141, i32 146, i32 151, i32 156>
2000 %strided.vec2 = shufflevector <160 x i8> %wide.vec, <160 x i8> poison, <32 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37, i32 42, i32 47, i32 52, i32 57, i32 62, i32 67, i32 72, i32 77, i32 82, i32 87, i32 92, i32 97, i32 102, i32 107, i32 112, i32 117, i32 122, i32 127, i32 132, i32 137, i32 142, i32 147, i32 152, i32 157>
2001 %strided.vec3 = shufflevector <160 x i8> %wide.vec, <160 x i8> poison, <32 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38, i32 43, i32 48, i32 53, i32 58, i32 63, i32 68, i32 73, i32 78, i32 83, i32 88, i32 93, i32 98, i32 103, i32 108, i32 113, i32 118, i32 123, i32 128, i32 133, i32 138, i32 143, i32 148, i32 153, i32 158>
2002 %strided.vec4 = shufflevector <160 x i8> %wide.vec, <160 x i8> poison, <32 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39, i32 44, i32 49, i32 54, i32 59, i32 64, i32 69, i32 74, i32 79, i32 84, i32 89, i32 94, i32 99, i32 104, i32 109, i32 114, i32 119, i32 124, i32 129, i32 134, i32 139, i32 144, i32 149, i32 154, i32 159>
2003 store <32 x i8> %strided.vec0, ptr %out.vec0, align 64
2004 store <32 x i8> %strided.vec1, ptr %out.vec1, align 64
2005 store <32 x i8> %strided.vec2, ptr %out.vec2, align 64
2006 store <32 x i8> %strided.vec3, ptr %out.vec3, align 64
2007 store <32 x i8> %strided.vec4, ptr %out.vec4, align 64
2008 ret void
2009 }
2011 define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
2012 ; SSE-LABEL: load_i8_stride5_vf64:
2013 ; SSE: # %bb.0:
2014 ; SSE-NEXT: subq $552, %rsp # imm = 0x228
2015 ; SSE-NEXT: movdqa 160(%rdi), %xmm9
2016 ; SSE-NEXT: movdqa 176(%rdi), %xmm3
2017 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2018 ; SSE-NEXT: movdqa 208(%rdi), %xmm4
2019 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2020 ; SSE-NEXT: movdqa 192(%rdi), %xmm1
2021 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2022 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
2023 ; SSE-NEXT: movdqa %xmm2, %xmm0
2024 ; SSE-NEXT: pandn %xmm1, %xmm0
2025 ; SSE-NEXT: movdqa %xmm4, %xmm1
2026 ; SSE-NEXT: pand %xmm2, %xmm1
2027 ; SSE-NEXT: movdqa %xmm2, %xmm14
2028 ; SSE-NEXT: por %xmm0, %xmm1
2029 ; SSE-NEXT: pxor %xmm12, %xmm12
2030 ; SSE-NEXT: movdqa %xmm1, %xmm0
2031 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
2032 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
2033 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
2034 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15]
2035 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
2036 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
2037 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2038 ; SSE-NEXT: packuswb %xmm1, %xmm0
2039 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3]
2040 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255]
2041 ; SSE-NEXT: movdqa %xmm11, %xmm1
2042 ; SSE-NEXT: pandn %xmm0, %xmm1
2043 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
2044 ; SSE-NEXT: movdqa %xmm10, %xmm0
2045 ; SSE-NEXT: pandn %xmm3, %xmm0
2046 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
2047 ; SSE-NEXT: movdqa %xmm2, %xmm3
2048 ; SSE-NEXT: movdqa %xmm2, %xmm4
2049 ; SSE-NEXT: pandn %xmm9, %xmm3
2050 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2051 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
2052 ; SSE-NEXT: movdqa %xmm7, %xmm3
2053 ; SSE-NEXT: pandn %xmm9, %xmm3
2054 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2055 ; SSE-NEXT: movdqa %xmm14, %xmm2
2056 ; SSE-NEXT: pandn %xmm9, %xmm2
2057 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2058 ; SSE-NEXT: movdqa %xmm10, %xmm2
2059 ; SSE-NEXT: pandn %xmm9, %xmm2
2060 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2061 ; SSE-NEXT: pand %xmm10, %xmm9
2062 ; SSE-NEXT: por %xmm0, %xmm9
2063 ; SSE-NEXT: movdqa %xmm9, %xmm0
2064 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
2065 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,0,0,65535,65535]
2066 ; SSE-NEXT: movdqa %xmm8, %xmm2
2067 ; SSE-NEXT: pandn %xmm0, %xmm2
2068 ; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15]
2069 ; SSE-NEXT: pand %xmm8, %xmm9
2070 ; SSE-NEXT: por %xmm2, %xmm9
2071 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,1,3,4,5,6,7]
2072 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
2073 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
2074 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
2075 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,7]
2076 ; SSE-NEXT: packuswb %xmm0, %xmm0
2077 ; SSE-NEXT: pand %xmm11, %xmm0
2078 ; SSE-NEXT: por %xmm1, %xmm0
2079 ; SSE-NEXT: movdqa 224(%rdi), %xmm3
2080 ; SSE-NEXT: movdqa %xmm3, %xmm2
2081 ; SSE-NEXT: pxor %xmm1, %xmm1
2082 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2083 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2084 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
2085 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2086 ; SSE-NEXT: pxor %xmm9, %xmm9
2087 ; SSE-NEXT: movdqa %xmm3, %xmm1
2088 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0]
2089 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3]
2090 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
2091 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
2092 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4]
2093 ; SSE-NEXT: packuswb %xmm1, %xmm1
2094 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
2095 ; SSE-NEXT: movdqa %xmm6, %xmm2
2096 ; SSE-NEXT: pandn %xmm1, %xmm2
2097 ; SSE-NEXT: pand %xmm6, %xmm0
2098 ; SSE-NEXT: por %xmm0, %xmm2
2099 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2100 ; SSE-NEXT: movdqa 32(%rdi), %xmm1
2101 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2102 ; SSE-NEXT: movdqa %xmm14, %xmm0
2103 ; SSE-NEXT: pandn %xmm1, %xmm0
2104 ; SSE-NEXT: movdqa 48(%rdi), %xmm15
2105 ; SSE-NEXT: movdqa %xmm15, %xmm1
2106 ; SSE-NEXT: pand %xmm14, %xmm1
2107 ; SSE-NEXT: por %xmm0, %xmm1
2108 ; SSE-NEXT: movdqa %xmm1, %xmm0
2109 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
2110 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
2111 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
2112 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
2113 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
2114 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
2115 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2116 ; SSE-NEXT: packuswb %xmm1, %xmm0
2117 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,3]
2118 ; SSE-NEXT: movdqa %xmm11, %xmm1
2119 ; SSE-NEXT: pandn %xmm0, %xmm1
2120 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
2121 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2122 ; SSE-NEXT: movdqa %xmm10, %xmm2
2123 ; SSE-NEXT: pandn %xmm0, %xmm2
2124 ; SSE-NEXT: movdqa (%rdi), %xmm3
2125 ; SSE-NEXT: movdqa %xmm4, %xmm0
2126 ; SSE-NEXT: pandn %xmm3, %xmm4
2127 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2128 ; SSE-NEXT: movdqa %xmm7, %xmm4
2129 ; SSE-NEXT: pandn %xmm3, %xmm4
2130 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2131 ; SSE-NEXT: movdqa %xmm14, %xmm4
2132 ; SSE-NEXT: pandn %xmm3, %xmm4
2133 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2134 ; SSE-NEXT: movdqa %xmm10, %xmm4
2135 ; SSE-NEXT: pandn %xmm3, %xmm4
2136 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2137 ; SSE-NEXT: pand %xmm10, %xmm3
2138 ; SSE-NEXT: por %xmm2, %xmm3
2139 ; SSE-NEXT: movdqa %xmm3, %xmm2
2140 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
2141 ; SSE-NEXT: movdqa %xmm8, %xmm4
2142 ; SSE-NEXT: pandn %xmm2, %xmm4
2143 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
2144 ; SSE-NEXT: pand %xmm8, %xmm3
2145 ; SSE-NEXT: por %xmm4, %xmm3
2146 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[0,2,1,3,4,5,6,7]
2147 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
2148 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
2149 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7]
2150 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
2151 ; SSE-NEXT: packuswb %xmm2, %xmm2
2152 ; SSE-NEXT: pand %xmm11, %xmm2
2153 ; SSE-NEXT: por %xmm1, %xmm2
2154 ; SSE-NEXT: movdqa 64(%rdi), %xmm1
2155 ; SSE-NEXT: movdqa %xmm1, %xmm3
2156 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
2157 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2158 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
2159 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2160 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[0,0]
2161 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,3]
2162 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
2163 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
2164 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4]
2165 ; SSE-NEXT: packuswb %xmm1, %xmm1
2166 ; SSE-NEXT: movdqa %xmm6, %xmm3
2167 ; SSE-NEXT: pandn %xmm1, %xmm3
2168 ; SSE-NEXT: pand %xmm6, %xmm2
2169 ; SSE-NEXT: por %xmm2, %xmm3
2170 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2171 ; SSE-NEXT: movdqa 272(%rdi), %xmm2
2172 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2173 ; SSE-NEXT: movdqa %xmm14, %xmm1
2174 ; SSE-NEXT: pandn %xmm2, %xmm1
2175 ; SSE-NEXT: movdqa 288(%rdi), %xmm13
2176 ; SSE-NEXT: movdqa %xmm13, %xmm2
2177 ; SSE-NEXT: pand %xmm14, %xmm2
2178 ; SSE-NEXT: por %xmm1, %xmm2
2179 ; SSE-NEXT: movdqa %xmm2, %xmm1
2180 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
2181 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3]
2182 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
2183 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
2184 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
2185 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
2186 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
2187 ; SSE-NEXT: packuswb %xmm2, %xmm1
2188 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,3]
2189 ; SSE-NEXT: movdqa %xmm11, %xmm2
2190 ; SSE-NEXT: pandn %xmm1, %xmm2
2191 ; SSE-NEXT: movdqa 256(%rdi), %xmm1
2192 ; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
2193 ; SSE-NEXT: movdqa %xmm10, %xmm4
2194 ; SSE-NEXT: pandn %xmm1, %xmm4
2195 ; SSE-NEXT: movdqa 240(%rdi), %xmm3
2196 ; SSE-NEXT: movdqa %xmm0, %xmm1
2197 ; SSE-NEXT: pandn %xmm3, %xmm1
2198 ; SSE-NEXT: pandn %xmm3, %xmm7
2199 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2200 ; SSE-NEXT: movdqa %xmm14, %xmm7
2201 ; SSE-NEXT: pandn %xmm3, %xmm7
2202 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2203 ; SSE-NEXT: movdqa %xmm10, %xmm7
2204 ; SSE-NEXT: pandn %xmm3, %xmm7
2205 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2206 ; SSE-NEXT: pand %xmm10, %xmm3
2207 ; SSE-NEXT: por %xmm4, %xmm3
2208 ; SSE-NEXT: movdqa %xmm3, %xmm4
2209 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
2210 ; SSE-NEXT: movdqa %xmm8, %xmm7
2211 ; SSE-NEXT: pandn %xmm4, %xmm7
2212 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
2213 ; SSE-NEXT: pand %xmm8, %xmm3
2214 ; SSE-NEXT: por %xmm7, %xmm3
2215 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,1,3,4,5,6,7]
2216 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,7]
2217 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
2218 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,1,3,4,5,6,7]
2219 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,7]
2220 ; SSE-NEXT: packuswb %xmm3, %xmm3
2221 ; SSE-NEXT: pand %xmm11, %xmm3
2222 ; SSE-NEXT: por %xmm2, %xmm3
2223 ; SSE-NEXT: movdqa 304(%rdi), %xmm2
2224 ; SSE-NEXT: movdqa %xmm2, %xmm4
2225 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
2226 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2227 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
2228 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2229 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[0,0]
2230 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,3]
2231 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
2232 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
2233 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4]
2234 ; SSE-NEXT: packuswb %xmm2, %xmm2
2235 ; SSE-NEXT: movdqa %xmm6, %xmm4
2236 ; SSE-NEXT: pandn %xmm2, %xmm4
2237 ; SSE-NEXT: pand %xmm6, %xmm3
2238 ; SSE-NEXT: por %xmm3, %xmm4
2239 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2240 ; SSE-NEXT: movdqa 112(%rdi), %xmm3
2241 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2242 ; SSE-NEXT: movdqa %xmm14, %xmm2
2243 ; SSE-NEXT: pandn %xmm3, %xmm2
2244 ; SSE-NEXT: movdqa 128(%rdi), %xmm3
2245 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2246 ; SSE-NEXT: pand %xmm14, %xmm3
2247 ; SSE-NEXT: por %xmm2, %xmm3
2248 ; SSE-NEXT: movdqa %xmm3, %xmm2
2249 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
2250 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,1,3]
2251 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
2252 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
2253 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
2254 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
2255 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
2256 ; SSE-NEXT: packuswb %xmm3, %xmm2
2257 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,3]
2258 ; SSE-NEXT: movdqa %xmm11, %xmm3
2259 ; SSE-NEXT: pandn %xmm2, %xmm3
2260 ; SSE-NEXT: movdqa 96(%rdi), %xmm4
2261 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2262 ; SSE-NEXT: movdqa %xmm10, %xmm2
2263 ; SSE-NEXT: pandn %xmm4, %xmm2
2264 ; SSE-NEXT: movdqa 80(%rdi), %xmm4
2265 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2266 ; SSE-NEXT: pand %xmm10, %xmm4
2267 ; SSE-NEXT: por %xmm2, %xmm4
2268 ; SSE-NEXT: movdqa %xmm4, %xmm2
2269 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
2270 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
2271 ; SSE-NEXT: pand %xmm8, %xmm4
2272 ; SSE-NEXT: pandn %xmm2, %xmm8
2273 ; SSE-NEXT: por %xmm4, %xmm8
2274 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[0,2,1,3,4,5,6,7]
2275 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
2276 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
2277 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,1,3,4,5,6,7]
2278 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,7]
2279 ; SSE-NEXT: packuswb %xmm2, %xmm2
2280 ; SSE-NEXT: pand %xmm11, %xmm2
2281 ; SSE-NEXT: por %xmm3, %xmm2
2282 ; SSE-NEXT: movdqa 144(%rdi), %xmm12
2283 ; SSE-NEXT: movdqa %xmm12, %xmm4
2284 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
2285 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2286 ; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
2287 ; SSE-NEXT: movdqa %xmm12, %xmm3
2288 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2289 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[0,0]
2290 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,3]
2291 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
2292 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
2293 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4]
2294 ; SSE-NEXT: packuswb %xmm3, %xmm3
2295 ; SSE-NEXT: movdqa %xmm6, %xmm14
2296 ; SSE-NEXT: movdqa %xmm6, %xmm4
2297 ; SSE-NEXT: pandn %xmm3, %xmm4
2298 ; SSE-NEXT: pand %xmm6, %xmm2
2299 ; SSE-NEXT: por %xmm2, %xmm4
2300 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2301 ; SSE-NEXT: movdqa %xmm10, %xmm2
2302 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2303 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2304 ; SSE-NEXT: pand %xmm10, %xmm3
2305 ; SSE-NEXT: por %xmm2, %xmm3
2306 ; SSE-NEXT: movdqa %xmm3, %xmm2
2307 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
2308 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
2309 ; SSE-NEXT: movdqa %xmm3, %xmm4
2310 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[0,0]
2311 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[2,3]
2312 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,1,3]
2313 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7]
2314 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,1]
2315 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7]
2316 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
2317 ; SSE-NEXT: psllq $48, %xmm3
2318 ; SSE-NEXT: packuswb %xmm2, %xmm3
2319 ; SSE-NEXT: movdqa %xmm11, %xmm4
2320 ; SSE-NEXT: pandn %xmm3, %xmm4
2321 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2322 ; SSE-NEXT: movdqa %xmm6, %xmm3
2323 ; SSE-NEXT: pand %xmm0, %xmm3
2324 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
2325 ; SSE-NEXT: movdqa %xmm3, %xmm7
2326 ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15]
2327 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,0,65535,65535,65535,0]
2328 ; SSE-NEXT: movdqa %xmm2, %xmm8
2329 ; SSE-NEXT: pandn %xmm7, %xmm8
2330 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
2331 ; SSE-NEXT: pand %xmm2, %xmm3
2332 ; SSE-NEXT: por %xmm8, %xmm3
2333 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3]
2334 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,5]
2335 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
2336 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7]
2337 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,4,5,7]
2338 ; SSE-NEXT: packuswb %xmm3, %xmm3
2339 ; SSE-NEXT: pand %xmm11, %xmm3
2340 ; SSE-NEXT: por %xmm4, %xmm3
2341 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2342 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2343 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm7[3,0]
2344 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm4[0,2]
2345 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,7,6,7]
2346 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
2347 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4]
2348 ; SSE-NEXT: packuswb %xmm4, %xmm4
2349 ; SSE-NEXT: movdqa %xmm14, %xmm7
2350 ; SSE-NEXT: pandn %xmm4, %xmm7
2351 ; SSE-NEXT: pand %xmm14, %xmm3
2352 ; SSE-NEXT: por %xmm3, %xmm7
2353 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2354 ; SSE-NEXT: movdqa %xmm10, %xmm3
2355 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
2356 ; SSE-NEXT: movdqa %xmm15, %xmm4
2357 ; SSE-NEXT: pand %xmm10, %xmm4
2358 ; SSE-NEXT: movdqa %xmm10, %xmm5
2359 ; SSE-NEXT: por %xmm3, %xmm4
2360 ; SSE-NEXT: movdqa %xmm4, %xmm3
2361 ; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15]
2362 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
2363 ; SSE-NEXT: movdqa %xmm4, %xmm7
2364 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[0,0]
2365 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm3[2,3]
2366 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,1,3]
2367 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,6,5,6,7]
2368 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,1]
2369 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,3,0,4,5,6,7]
2370 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
2371 ; SSE-NEXT: psllq $48, %xmm4
2372 ; SSE-NEXT: packuswb %xmm3, %xmm4
2373 ; SSE-NEXT: movdqa %xmm11, %xmm3
2374 ; SSE-NEXT: pandn %xmm4, %xmm3
2375 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2376 ; SSE-NEXT: movdqa %xmm10, %xmm4
2377 ; SSE-NEXT: movdqa %xmm0, %xmm8
2378 ; SSE-NEXT: pand %xmm0, %xmm4
2379 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
2380 ; SSE-NEXT: movdqa %xmm4, %xmm0
2381 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
2382 ; SSE-NEXT: movdqa %xmm2, %xmm7
2383 ; SSE-NEXT: pandn %xmm0, %xmm7
2384 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
2385 ; SSE-NEXT: pand %xmm2, %xmm4
2386 ; SSE-NEXT: por %xmm7, %xmm4
2387 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,1,3]
2388 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
2389 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
2390 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
2391 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,5,7]
2392 ; SSE-NEXT: packuswb %xmm0, %xmm0
2393 ; SSE-NEXT: pand %xmm11, %xmm0
2394 ; SSE-NEXT: por %xmm3, %xmm0
2395 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2396 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2397 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[3,0]
2398 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2]
2399 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,7,6,7]
2400 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
2401 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4]
2402 ; SSE-NEXT: packuswb %xmm3, %xmm3
2403 ; SSE-NEXT: movdqa %xmm14, %xmm4
2404 ; SSE-NEXT: pandn %xmm3, %xmm4
2405 ; SSE-NEXT: pand %xmm14, %xmm0
2406 ; SSE-NEXT: por %xmm0, %xmm4
2407 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2408 ; SSE-NEXT: movdqa %xmm5, %xmm0
2409 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2410 ; SSE-NEXT: movdqa %xmm13, %xmm3
2411 ; SSE-NEXT: pand %xmm5, %xmm3
2412 ; SSE-NEXT: por %xmm0, %xmm3
2413 ; SSE-NEXT: movdqa %xmm3, %xmm0
2414 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15]
2415 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
2416 ; SSE-NEXT: movdqa %xmm3, %xmm4
2417 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm0[0,0]
2418 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,3]
2419 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2,1,3]
2420 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,6,7]
2421 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
2422 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7]
2423 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
2424 ; SSE-NEXT: psllq $48, %xmm3
2425 ; SSE-NEXT: packuswb %xmm0, %xmm3
2426 ; SSE-NEXT: movdqa %xmm11, %xmm0
2427 ; SSE-NEXT: pandn %xmm3, %xmm0
2428 ; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload
2429 ; SSE-NEXT: pand %xmm8, %xmm3
2430 ; SSE-NEXT: movdqa %xmm8, %xmm7
2431 ; SSE-NEXT: por %xmm1, %xmm3
2432 ; SSE-NEXT: movdqa %xmm3, %xmm1
2433 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
2434 ; SSE-NEXT: movdqa %xmm2, %xmm4
2435 ; SSE-NEXT: pandn %xmm1, %xmm4
2436 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
2437 ; SSE-NEXT: pand %xmm2, %xmm3
2438 ; SSE-NEXT: por %xmm4, %xmm3
2439 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,1,3]
2440 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
2441 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
2442 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
2443 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,7]
2444 ; SSE-NEXT: packuswb %xmm1, %xmm1
2445 ; SSE-NEXT: pand %xmm11, %xmm1
2446 ; SSE-NEXT: por %xmm0, %xmm1
2447 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2448 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2449 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[3,0]
2450 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[0,2]
2451 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,7,6,7]
2452 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
2453 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
2454 ; SSE-NEXT: packuswb %xmm0, %xmm0
2455 ; SSE-NEXT: movdqa %xmm14, %xmm3
2456 ; SSE-NEXT: pandn %xmm0, %xmm3
2457 ; SSE-NEXT: pand %xmm14, %xmm1
2458 ; SSE-NEXT: por %xmm1, %xmm3
2459 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2460 ; SSE-NEXT: movdqa %xmm5, %xmm1
2461 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2462 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2463 ; SSE-NEXT: pand %xmm5, %xmm0
2464 ; SSE-NEXT: movdqa %xmm5, %xmm8
2465 ; SSE-NEXT: por %xmm1, %xmm0
2466 ; SSE-NEXT: movdqa %xmm0, %xmm1
2467 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
2468 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
2469 ; SSE-NEXT: movdqa %xmm0, %xmm3
2470 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm1[0,0]
2471 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,3]
2472 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,1,3]
2473 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,7]
2474 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,1]
2475 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
2476 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
2477 ; SSE-NEXT: psllq $48, %xmm0
2478 ; SSE-NEXT: packuswb %xmm1, %xmm0
2479 ; SSE-NEXT: movdqa %xmm7, %xmm4
2480 ; SSE-NEXT: movdqa %xmm7, %xmm1
2481 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2482 ; SSE-NEXT: pandn %xmm5, %xmm1
2483 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2484 ; SSE-NEXT: pand %xmm7, %xmm3
2485 ; SSE-NEXT: por %xmm1, %xmm3
2486 ; SSE-NEXT: movdqa %xmm3, %xmm1
2487 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
2488 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
2489 ; SSE-NEXT: pand %xmm2, %xmm3
2490 ; SSE-NEXT: pandn %xmm1, %xmm2
2491 ; SSE-NEXT: por %xmm3, %xmm2
2492 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3]
2493 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
2494 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
2495 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7]
2496 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,4,5,7]
2497 ; SSE-NEXT: packuswb %xmm1, %xmm1
2498 ; SSE-NEXT: pand %xmm11, %xmm1
2499 ; SSE-NEXT: pandn %xmm0, %xmm11
2500 ; SSE-NEXT: por %xmm11, %xmm1
2501 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2502 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm2[3,0]
2503 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[0,2]
2504 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,7,6,7]
2505 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
2506 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
2507 ; SSE-NEXT: packuswb %xmm0, %xmm0
2508 ; SSE-NEXT: movdqa %xmm14, %xmm2
2509 ; SSE-NEXT: pandn %xmm0, %xmm2
2510 ; SSE-NEXT: pand %xmm14, %xmm1
2511 ; SSE-NEXT: por %xmm1, %xmm2
2512 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2513 ; SSE-NEXT: movdqa %xmm6, %xmm1
2514 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
2515 ; SSE-NEXT: pand %xmm11, %xmm1
2516 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2517 ; SSE-NEXT: movdqa %xmm1, %xmm2
2518 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
2519 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535]
2520 ; SSE-NEXT: movdqa %xmm6, %xmm3
2521 ; SSE-NEXT: pandn %xmm2, %xmm3
2522 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
2523 ; SSE-NEXT: pand %xmm6, %xmm1
2524 ; SSE-NEXT: por %xmm3, %xmm1
2525 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
2526 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
2527 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
2528 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
2529 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
2530 ; SSE-NEXT: packuswb %xmm1, %xmm1
2531 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [0,0,0,65535,65535,65535,65535,65535]
2532 ; SSE-NEXT: movdqa %xmm12, %xmm2
2533 ; SSE-NEXT: pandn %xmm1, %xmm2
2534 ; SSE-NEXT: movdqa %xmm8, %xmm1
2535 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2536 ; SSE-NEXT: movdqa %xmm4, %xmm0
2537 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2538 ; SSE-NEXT: pandn %xmm4, %xmm0
2539 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2540 ; SSE-NEXT: movdqa %xmm11, %xmm3
2541 ; SSE-NEXT: pandn %xmm4, %xmm3
2542 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2543 ; SSE-NEXT: pand %xmm8, %xmm4
2544 ; SSE-NEXT: por %xmm1, %xmm4
2545 ; SSE-NEXT: movdqa %xmm4, %xmm1
2546 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
2547 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
2548 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,1,2,0]
2549 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0]
2550 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2]
2551 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
2552 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
2553 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2554 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
2555 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
2556 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,6,5]
2557 ; SSE-NEXT: packuswb %xmm1, %xmm4
2558 ; SSE-NEXT: pand %xmm12, %xmm4
2559 ; SSE-NEXT: por %xmm2, %xmm4
2560 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2561 ; SSE-NEXT: # xmm1 = mem[1,1,1,1]
2562 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2563 ; SSE-NEXT: # xmm2 = mem[0,2,2,3]
2564 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2565 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7]
2566 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
2567 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
2568 ; SSE-NEXT: packuswb %xmm1, %xmm1
2569 ; SSE-NEXT: movdqa %xmm14, %xmm3
2570 ; SSE-NEXT: pandn %xmm1, %xmm3
2571 ; SSE-NEXT: pand %xmm14, %xmm4
2572 ; SSE-NEXT: por %xmm4, %xmm3
2573 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2574 ; SSE-NEXT: pand %xmm11, %xmm10
2575 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
2576 ; SSE-NEXT: movdqa %xmm10, %xmm2
2577 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
2578 ; SSE-NEXT: movdqa %xmm6, %xmm4
2579 ; SSE-NEXT: pandn %xmm2, %xmm4
2580 ; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
2581 ; SSE-NEXT: pand %xmm6, %xmm10
2582 ; SSE-NEXT: por %xmm4, %xmm10
2583 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,1,2,3,4,5,6,7]
2584 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
2585 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
2586 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
2587 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
2588 ; SSE-NEXT: packuswb %xmm1, %xmm1
2589 ; SSE-NEXT: movdqa %xmm12, %xmm2
2590 ; SSE-NEXT: pandn %xmm1, %xmm2
2591 ; SSE-NEXT: movdqa %xmm8, %xmm1
2592 ; SSE-NEXT: pandn %xmm15, %xmm1
2593 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
2594 ; SSE-NEXT: movdqa %xmm10, %xmm0
2595 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2596 ; SSE-NEXT: pandn %xmm4, %xmm0
2597 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2598 ; SSE-NEXT: movdqa %xmm11, %xmm3
2599 ; SSE-NEXT: pandn %xmm4, %xmm3
2600 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2601 ; SSE-NEXT: pand %xmm8, %xmm4
2602 ; SSE-NEXT: por %xmm1, %xmm4
2603 ; SSE-NEXT: movdqa %xmm4, %xmm1
2604 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
2605 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
2606 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,1,2,0]
2607 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[3,0]
2608 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,2]
2609 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
2610 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
2611 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
2612 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
2613 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
2614 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,5,6,5]
2615 ; SSE-NEXT: packuswb %xmm1, %xmm4
2616 ; SSE-NEXT: pand %xmm12, %xmm4
2617 ; SSE-NEXT: por %xmm2, %xmm4
2618 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2619 ; SSE-NEXT: # xmm1 = mem[1,1,1,1]
2620 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2621 ; SSE-NEXT: # xmm2 = mem[0,2,2,3]
2622 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2623 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7]
2624 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
2625 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,5]
2626 ; SSE-NEXT: packuswb %xmm1, %xmm1
2627 ; SSE-NEXT: movdqa %xmm14, %xmm2
2628 ; SSE-NEXT: pandn %xmm1, %xmm2
2629 ; SSE-NEXT: pand %xmm14, %xmm4
2630 ; SSE-NEXT: por %xmm4, %xmm2
2631 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2632 ; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
2633 ; SSE-NEXT: pand %xmm11, %xmm1
2634 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2635 ; SSE-NEXT: movdqa %xmm1, %xmm2
2636 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
2637 ; SSE-NEXT: movdqa %xmm6, %xmm4
2638 ; SSE-NEXT: pandn %xmm2, %xmm4
2639 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
2640 ; SSE-NEXT: pand %xmm6, %xmm1
2641 ; SSE-NEXT: por %xmm4, %xmm1
2642 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
2643 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
2644 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
2645 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7]
2646 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
2647 ; SSE-NEXT: packuswb %xmm1, %xmm1
2648 ; SSE-NEXT: movdqa %xmm12, %xmm2
2649 ; SSE-NEXT: pandn %xmm1, %xmm2
2650 ; SSE-NEXT: movdqa %xmm8, %xmm4
2651 ; SSE-NEXT: pandn %xmm13, %xmm4
2652 ; SSE-NEXT: movdqa %xmm10, %xmm0
2653 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2654 ; SSE-NEXT: pandn %xmm7, %xmm0
2655 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2656 ; SSE-NEXT: movdqa %xmm11, %xmm1
2657 ; SSE-NEXT: pandn %xmm7, %xmm1
2658 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2659 ; SSE-NEXT: pand %xmm8, %xmm7
2660 ; SSE-NEXT: movdqa %xmm8, %xmm10
2661 ; SSE-NEXT: por %xmm4, %xmm7
2662 ; SSE-NEXT: movdqa %xmm7, %xmm4
2663 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
2664 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
2665 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,1,2,0]
2666 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[3,0]
2667 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[0,2]
2668 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
2669 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
2670 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
2671 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7]
2672 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6]
2673 ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5,6,5]
2674 ; SSE-NEXT: packuswb %xmm4, %xmm7
2675 ; SSE-NEXT: pand %xmm12, %xmm7
2676 ; SSE-NEXT: por %xmm2, %xmm7
2677 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2678 ; SSE-NEXT: # xmm2 = mem[1,1,1,1]
2679 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
2680 ; SSE-NEXT: # xmm4 = mem[0,2,2,3]
2681 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
2682 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,3,2,3,4,5,6,7]
2683 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
2684 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5]
2685 ; SSE-NEXT: packuswb %xmm2, %xmm2
2686 ; SSE-NEXT: movdqa %xmm14, %xmm1
2687 ; SSE-NEXT: pandn %xmm2, %xmm1
2688 ; SSE-NEXT: pand %xmm14, %xmm7
2689 ; SSE-NEXT: por %xmm7, %xmm1
2690 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2691 ; SSE-NEXT: movdqa %xmm11, %xmm8
2692 ; SSE-NEXT: movdqa %xmm11, %xmm2
2693 ; SSE-NEXT: pandn %xmm5, %xmm2
2694 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2695 ; SSE-NEXT: pand %xmm11, %xmm4
2696 ; SSE-NEXT: por %xmm2, %xmm4
2697 ; SSE-NEXT: movdqa %xmm4, %xmm2
2698 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15]
2699 ; SSE-NEXT: movdqa %xmm6, %xmm7
2700 ; SSE-NEXT: pandn %xmm2, %xmm7
2701 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
2702 ; SSE-NEXT: pand %xmm6, %xmm4
2703 ; SSE-NEXT: por %xmm7, %xmm4
2704 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,1,2,3,4,5,6,7]
2705 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
2706 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
2707 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,1,4,5,6,7]
2708 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,6,7]
2709 ; SSE-NEXT: packuswb %xmm4, %xmm4
2710 ; SSE-NEXT: movdqa %xmm12, %xmm3
2711 ; SSE-NEXT: pandn %xmm4, %xmm3
2712 ; SSE-NEXT: movdqa %xmm10, %xmm7
2713 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2714 ; SSE-NEXT: pandn %xmm5, %xmm7
2715 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2716 ; SSE-NEXT: movdqa %xmm0, %xmm14
2717 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
2718 ; SSE-NEXT: pand %xmm1, %xmm14
2719 ; SSE-NEXT: movdqa %xmm15, %xmm11
2720 ; SSE-NEXT: pand %xmm1, %xmm11
2721 ; SSE-NEXT: movdqa %xmm13, %xmm4
2722 ; SSE-NEXT: pand %xmm1, %xmm4
2723 ; SSE-NEXT: movdqa %xmm5, %xmm2
2724 ; SSE-NEXT: pand %xmm1, %xmm2
2725 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2726 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2727 ; SSE-NEXT: pandn %xmm2, %xmm1
2728 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2729 ; SSE-NEXT: pand %xmm8, %xmm0
2730 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2731 ; SSE-NEXT: pand %xmm8, %xmm15
2732 ; SSE-NEXT: pand %xmm8, %xmm13
2733 ; SSE-NEXT: pand %xmm8, %xmm5
2734 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2735 ; SSE-NEXT: movdqa %xmm2, %xmm0
2736 ; SSE-NEXT: pandn %xmm2, %xmm8
2737 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2738 ; SSE-NEXT: pand %xmm10, %xmm0
2739 ; SSE-NEXT: por %xmm7, %xmm0
2740 ; SSE-NEXT: movdqa %xmm0, %xmm7
2741 ; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15]
2742 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
2743 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
2744 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[3,0]
2745 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[0,2]
2746 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,1,2,3,4,5,6,7]
2747 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
2748 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2749 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7]
2750 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
2751 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5]
2752 ; SSE-NEXT: packuswb %xmm0, %xmm1
2753 ; SSE-NEXT: pand %xmm12, %xmm1
2754 ; SSE-NEXT: por %xmm3, %xmm1
2755 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2756 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
2757 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2758 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3]
2759 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2760 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7]
2761 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
2762 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5]
2763 ; SSE-NEXT: packuswb %xmm0, %xmm0
2764 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
2765 ; SSE-NEXT: movdqa %xmm10, %xmm2
2766 ; SSE-NEXT: pandn %xmm0, %xmm2
2767 ; SSE-NEXT: pand %xmm10, %xmm1
2768 ; SSE-NEXT: por %xmm1, %xmm2
2769 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2770 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2771 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
2772 ; SSE-NEXT: pand %xmm3, %xmm0
2773 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2774 ; SSE-NEXT: movdqa %xmm0, %xmm1
2775 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15]
2776 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
2777 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0]
2778 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
2779 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
2780 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7]
2781 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7]
2782 ; SSE-NEXT: packuswb %xmm0, %xmm0
2783 ; SSE-NEXT: movdqa %xmm12, %xmm1
2784 ; SSE-NEXT: pandn %xmm0, %xmm1
2785 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
2786 ; SSE-NEXT: movdqa %xmm14, %xmm0
2787 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
2788 ; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15]
2789 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm0[2,0]
2790 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,6,5,6,7]
2791 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
2792 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
2793 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
2794 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
2795 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
2796 ; SSE-NEXT: packuswb %xmm2, %xmm0
2797 ; SSE-NEXT: pand %xmm12, %xmm0
2798 ; SSE-NEXT: por %xmm1, %xmm0
2799 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2800 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2801 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
2802 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
2803 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7]
2804 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
2805 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5]
2806 ; SSE-NEXT: packuswb %xmm1, %xmm1
2807 ; SSE-NEXT: movdqa %xmm10, %xmm9
2808 ; SSE-NEXT: movdqa %xmm10, %xmm14
2809 ; SSE-NEXT: pandn %xmm1, %xmm14
2810 ; SSE-NEXT: pand %xmm10, %xmm0
2811 ; SSE-NEXT: por %xmm0, %xmm14
2812 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2813 ; SSE-NEXT: pand %xmm3, %xmm0
2814 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2815 ; SSE-NEXT: movdqa %xmm0, %xmm1
2816 ; SSE-NEXT: pxor %xmm2, %xmm2
2817 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
2818 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2819 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0]
2820 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
2821 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
2822 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7]
2823 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7]
2824 ; SSE-NEXT: packuswb %xmm0, %xmm0
2825 ; SSE-NEXT: movdqa %xmm12, %xmm1
2826 ; SSE-NEXT: pandn %xmm0, %xmm1
2827 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
2828 ; SSE-NEXT: movdqa %xmm11, %xmm0
2829 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2830 ; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15]
2831 ; SSE-NEXT: pxor %xmm10, %xmm10
2832 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm0[2,0]
2833 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,6,5,6,7]
2834 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
2835 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
2836 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
2837 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
2838 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
2839 ; SSE-NEXT: packuswb %xmm2, %xmm0
2840 ; SSE-NEXT: pand %xmm12, %xmm0
2841 ; SSE-NEXT: por %xmm1, %xmm0
2842 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2843 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2844 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
2845 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
2846 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7]
2847 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
2848 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5]
2849 ; SSE-NEXT: packuswb %xmm1, %xmm1
2850 ; SSE-NEXT: movdqa %xmm9, %xmm11
2851 ; SSE-NEXT: pandn %xmm1, %xmm11
2852 ; SSE-NEXT: pand %xmm9, %xmm0
2853 ; SSE-NEXT: por %xmm0, %xmm11
2854 ; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
2855 ; SSE-NEXT: pand %xmm3, %xmm0
2856 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2857 ; SSE-NEXT: movdqa %xmm0, %xmm1
2858 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm10[8],xmm1[9],xmm10[9],xmm1[10],xmm10[10],xmm1[11],xmm10[11],xmm1[12],xmm10[12],xmm1[13],xmm10[13],xmm1[14],xmm10[14],xmm1[15],xmm10[15]
2859 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
2860 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,0]
2861 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5]
2862 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
2863 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,0,1,2,4,5,6,7]
2864 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7]
2865 ; SSE-NEXT: packuswb %xmm0, %xmm0
2866 ; SSE-NEXT: movdqa %xmm12, %xmm1
2867 ; SSE-NEXT: pandn %xmm0, %xmm1
2868 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
2869 ; SSE-NEXT: movdqa %xmm4, %xmm0
2870 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
2871 ; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm10[8],xmm4[9],xmm10[9],xmm4[10],xmm10[10],xmm4[11],xmm10[11],xmm4[12],xmm10[12],xmm4[13],xmm10[13],xmm4[14],xmm10[14],xmm4[15],xmm10[15]
2872 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[2,0]
2873 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7]
2874 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
2875 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
2876 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
2877 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
2878 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
2879 ; SSE-NEXT: packuswb %xmm2, %xmm0
2880 ; SSE-NEXT: pand %xmm12, %xmm0
2881 ; SSE-NEXT: por %xmm1, %xmm0
2882 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2883 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2884 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0]
2885 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2]
2886 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,6,6,7]
2887 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
2888 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,5]
2889 ; SSE-NEXT: packuswb %xmm1, %xmm2
2890 ; SSE-NEXT: movdqa %xmm9, %xmm10
2891 ; SSE-NEXT: pandn %xmm2, %xmm10
2892 ; SSE-NEXT: pand %xmm9, %xmm0
2893 ; SSE-NEXT: por %xmm0, %xmm10
2894 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2895 ; SSE-NEXT: movdqa %xmm3, %xmm2
2896 ; SSE-NEXT: pand %xmm3, %xmm0
2897 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2898 ; SSE-NEXT: por %xmm0, %xmm2
2899 ; SSE-NEXT: movdqa %xmm2, %xmm0
2900 ; SSE-NEXT: pxor %xmm1, %xmm1
2901 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
2902 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2903 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,0]
2904 ; SSE-NEXT: movaps %xmm2, %xmm4
2905 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2906 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2907 ; SSE-NEXT: movdqa %xmm2, %xmm0
2908 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2909 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2910 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,0]
2911 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
2912 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3]
2913 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
2914 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
2915 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
2916 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
2917 ; SSE-NEXT: packuswb %xmm2, %xmm0
2918 ; SSE-NEXT: pand %xmm12, %xmm0
2919 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,4,5,6,5]
2920 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
2921 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,0,1,2,4,5,6,7]
2922 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,4,6,7]
2923 ; SSE-NEXT: packuswb %xmm2, %xmm2
2924 ; SSE-NEXT: pandn %xmm2, %xmm12
2925 ; SSE-NEXT: por %xmm12, %xmm0
2926 ; SSE-NEXT: movdqa %xmm8, %xmm3
2927 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm8[3,0]
2928 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0,2]
2929 ; SSE-NEXT: pand %xmm9, %xmm0
2930 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7]
2931 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
2932 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,5]
2933 ; SSE-NEXT: packuswb %xmm2, %xmm2
2934 ; SSE-NEXT: pandn %xmm2, %xmm9
2935 ; SSE-NEXT: por %xmm0, %xmm9
2936 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2937 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
2938 ; SSE-NEXT: movdqa %xmm3, %xmm0
2939 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
2940 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
2941 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,1,3]
2942 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm0[1,2]
2943 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2,3,1]
2944 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,1,2,3,4,5,6,7]
2945 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
2946 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2947 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7]
2948 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
2949 ; SSE-NEXT: packuswb %xmm0, %xmm2
2950 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535]
2951 ; SSE-NEXT: movdqa %xmm4, %xmm3
2952 ; SSE-NEXT: pandn %xmm2, %xmm3
2953 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2954 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
2955 ; SSE-NEXT: pand %xmm12, %xmm8
2956 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
2957 ; SSE-NEXT: movdqa %xmm8, %xmm2
2958 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2959 ; SSE-NEXT: movdqa %xmm6, %xmm7
2960 ; SSE-NEXT: pandn %xmm2, %xmm7
2961 ; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15]
2962 ; SSE-NEXT: pand %xmm6, %xmm8
2963 ; SSE-NEXT: por %xmm7, %xmm8
2964 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,1,0,3,4,5,6,7]
2965 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4]
2966 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
2967 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,0,4,5,6,7]
2968 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
2969 ; SSE-NEXT: packuswb %xmm2, %xmm2
2970 ; SSE-NEXT: pand %xmm4, %xmm2
2971 ; SSE-NEXT: por %xmm3, %xmm2
2972 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
2973 ; SSE-NEXT: # xmm3 = mem[3,1,2,3]
2974 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
2975 ; SSE-NEXT: # xmm7 = mem[0,2,2,3]
2976 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
2977 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
2978 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
2979 ; SSE-NEXT: packuswb %xmm0, %xmm7
2980 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,1]
2981 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
2982 ; SSE-NEXT: movdqa %xmm15, %xmm0
2983 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
2984 ; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3],xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7]
2985 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,1,1,3]
2986 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,3],xmm0[1,2]
2987 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,2,3,1]
2988 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[3,1,2,3,4,5,6,7]
2989 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
2990 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2991 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7]
2992 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
2993 ; SSE-NEXT: packuswb %xmm0, %xmm3
2994 ; SSE-NEXT: movdqa %xmm4, %xmm7
2995 ; SSE-NEXT: pandn %xmm3, %xmm7
2996 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
2997 ; SSE-NEXT: pand %xmm12, %xmm15
2998 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
2999 ; SSE-NEXT: movdqa %xmm15, %xmm3
3000 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
3001 ; SSE-NEXT: movdqa %xmm6, %xmm8
3002 ; SSE-NEXT: pandn %xmm3, %xmm8
3003 ; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15]
3004 ; SSE-NEXT: pand %xmm6, %xmm15
3005 ; SSE-NEXT: por %xmm8, %xmm15
3006 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[3,1,0,3,4,5,6,7]
3007 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4]
3008 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
3009 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,0,4,5,6,7]
3010 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm3[0,1,2,3,6,5,6,7]
3011 ; SSE-NEXT: packuswb %xmm8, %xmm8
3012 ; SSE-NEXT: pand %xmm4, %xmm8
3013 ; SSE-NEXT: por %xmm7, %xmm8
3014 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3015 ; SSE-NEXT: # xmm3 = mem[3,1,2,3]
3016 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
3017 ; SSE-NEXT: # xmm7 = mem[0,2,2,3]
3018 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
3019 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
3020 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
3021 ; SSE-NEXT: packuswb %xmm0, %xmm7
3022 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm7[2,1]
3023 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
3024 ; SSE-NEXT: movdqa %xmm13, %xmm0
3025 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
3026 ; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3],xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
3027 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,1,3]
3028 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm0[1,2]
3029 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2,3,1]
3030 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,1,2,3,4,5,6,7]
3031 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
3032 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3033 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,0,4,5,6,7]
3034 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
3035 ; SSE-NEXT: packuswb %xmm0, %xmm3
3036 ; SSE-NEXT: movdqa %xmm4, %xmm7
3037 ; SSE-NEXT: pandn %xmm3, %xmm7
3038 ; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload
3039 ; SSE-NEXT: pand %xmm12, %xmm13
3040 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
3041 ; SSE-NEXT: movdqa %xmm13, %xmm3
3042 ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
3043 ; SSE-NEXT: movdqa %xmm6, %xmm5
3044 ; SSE-NEXT: pandn %xmm3, %xmm5
3045 ; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm1[8],xmm13[9],xmm1[9],xmm13[10],xmm1[10],xmm13[11],xmm1[11],xmm13[12],xmm1[12],xmm13[13],xmm1[13],xmm13[14],xmm1[14],xmm13[15],xmm1[15]
3046 ; SSE-NEXT: pand %xmm6, %xmm13
3047 ; SSE-NEXT: por %xmm5, %xmm13
3048 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[3,1,0,3,4,5,6,7]
3049 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,4]
3050 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,1]
3051 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,0,4,5,6,7]
3052 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,6,7]
3053 ; SSE-NEXT: packuswb %xmm5, %xmm5
3054 ; SSE-NEXT: pand %xmm4, %xmm5
3055 ; SSE-NEXT: por %xmm7, %xmm5
3056 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3057 ; SSE-NEXT: # xmm3 = mem[3,1,2,3]
3058 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
3059 ; SSE-NEXT: # xmm7 = mem[0,2,2,3]
3060 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
3061 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
3062 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
3063 ; SSE-NEXT: packuswb %xmm0, %xmm7
3064 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,1]
3065 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3066 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
3067 ; SSE-NEXT: movdqa %xmm7, %xmm0
3068 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
3069 ; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
3070 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,1,3]
3071 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm0[1,2]
3072 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3073 ; SSE-NEXT: pand %xmm12, %xmm0
3074 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
3075 ; SSE-NEXT: por %xmm0, %xmm12
3076 ; SSE-NEXT: movdqa %xmm12, %xmm0
3077 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3078 ; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15]
3079 ; SSE-NEXT: pand %xmm6, %xmm12
3080 ; SSE-NEXT: pandn %xmm0, %xmm6
3081 ; SSE-NEXT: por %xmm12, %xmm6
3082 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[3,1,0,3,4,5,6,7]
3083 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
3084 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
3085 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7]
3086 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
3087 ; SSE-NEXT: packuswb %xmm0, %xmm0
3088 ; SSE-NEXT: pand %xmm4, %xmm0
3089 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2,3,1]
3090 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7]
3091 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
3092 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
3093 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,3,0,4,5,6,7]
3094 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
3095 ; SSE-NEXT: packuswb %xmm6, %xmm3
3096 ; SSE-NEXT: pandn %xmm3, %xmm4
3097 ; SSE-NEXT: por %xmm4, %xmm0
3098 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3099 ; SSE-NEXT: # xmm3 = mem[3,1,2,3]
3100 ; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3101 ; SSE-NEXT: # xmm4 = mem[0,2,2,3]
3102 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
3103 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
3104 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
3105 ; SSE-NEXT: packuswb %xmm6, %xmm4
3106 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,1]
3107 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3108 ; SSE-NEXT: movaps %xmm3, 16(%rsi)
3109 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3110 ; SSE-NEXT: movaps %xmm3, 48(%rsi)
3111 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3112 ; SSE-NEXT: movaps %xmm3, (%rsi)
3113 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3114 ; SSE-NEXT: movaps %xmm3, 32(%rsi)
3115 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3116 ; SSE-NEXT: movaps %xmm3, 16(%rdx)
3117 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3118 ; SSE-NEXT: movaps %xmm3, 48(%rdx)
3119 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3120 ; SSE-NEXT: movaps %xmm3, (%rdx)
3121 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3122 ; SSE-NEXT: movaps %xmm3, 32(%rdx)
3123 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3124 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
3125 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3126 ; SSE-NEXT: movaps %xmm1, 48(%rcx)
3127 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3128 ; SSE-NEXT: movaps %xmm1, (%rcx)
3129 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3130 ; SSE-NEXT: movaps %xmm1, 32(%rcx)
3131 ; SSE-NEXT: movdqa %xmm9, 16(%r8)
3132 ; SSE-NEXT: movdqa %xmm10, 48(%r8)
3133 ; SSE-NEXT: movdqa %xmm11, (%r8)
3134 ; SSE-NEXT: movdqa %xmm14, 32(%r8)
3135 ; SSE-NEXT: movaps %xmm0, 16(%r9)
3136 ; SSE-NEXT: movaps %xmm5, 48(%r9)
3137 ; SSE-NEXT: movaps %xmm8, (%r9)
3138 ; SSE-NEXT: movaps %xmm2, 32(%r9)
3139 ; SSE-NEXT: addq $552, %rsp # imm = 0x228
3140 ; SSE-NEXT: retq
3142 ; AVX1-ONLY-LABEL: load_i8_stride5_vf64:
3143 ; AVX1-ONLY: # %bb.0:
3144 ; AVX1-ONLY-NEXT: subq $488, %rsp # imm = 0x1E8
3145 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,9,14,0,4,9,14,0,4,9,14,0,4,9,14,0]
3146 ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm4
3147 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1
3148 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7
3149 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm5
3150 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm0
3151 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm11
3152 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3153 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15]
3154 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm1
3155 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm8
3156 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3157 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3158 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,2,7,12,0,0,128,128,128,2,7,12,0,0,128]
3159 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
3160 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm0
3161 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm9
3162 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill
3163 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [8,13,128,128,128,0,0,3,8,13,128,128,128,0,0,3]
3164 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
3165 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm6
3166 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm12
3167 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3168 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm6
3169 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u>
3170 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm6, %xmm1
3171 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3172 ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1
3173 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm2
3174 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm14
3175 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3176 ; AVX1-ONLY-NEXT: vmovdqa 160(%rdi), %xmm1
3177 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3
3178 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm13
3179 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3180 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3181 ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm1
3182 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm3
3183 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm10
3184 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3185 ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm1
3186 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm4
3187 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3188 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
3189 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm2
3190 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3191 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [1,6,11,128,128,128,128,0,1,6,11,128,128,128,128,0]
3192 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
3193 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4
3194 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [128,128,128,0,5,10,15,0,128,128,128,0,5,10,15,0]
3195 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
3196 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm6
3197 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4
3198 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128]
3199 ; AVX1-ONLY-NEXT: # xmm7 = mem[0,0]
3200 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm6
3201 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [9,14,128,128,128,0,0,4,9,14,128,128,128,0,0,4]
3202 ; AVX1-ONLY-NEXT: # xmm8 = mem[0,0]
3203 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm9
3204 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6
3205 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm4, %xmm6, %xmm2
3206 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3207 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm3
3208 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm4
3209 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
3210 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11]
3211 ; AVX1-ONLY-NEXT: # xmm11 = mem[0,0]
3212 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm10, %xmm4
3213 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm5
3214 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4
3215 ; AVX1-ONLY-NEXT: vmovdqa 144(%rdi), %xmm1
3216 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm3, %xmm4, %xmm0
3217 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3218 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm0
3219 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm8
3220 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128]
3221 ; AVX1-ONLY-NEXT: # xmm7 = mem[0,0]
3222 ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm13
3223 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm13, %xmm3
3224 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0
3225 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [8,13,0,0,128,128,128,3,8,13,0,0,128,128,128,3]
3226 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
3227 ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm1
3228 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3229 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm3
3230 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [128,128,0,0,4,9,14,128,128,128,0,0,4,9,14,128]
3231 ; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
3232 ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1
3233 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3234 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm12
3235 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm12, %xmm3
3236 ; AVX1-ONLY-NEXT: vpxor %xmm4, %xmm4, %xmm4
3237 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm3[2,3,4,5,6,7]
3238 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,5,10,15,128,128,128,128,128,128,u,u,u,u,u,u>
3239 ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm14
3240 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm15
3241 ; AVX1-ONLY-NEXT: vpor %xmm15, %xmm12, %xmm12
3242 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm12[0,1,2,3,4],xmm0[5,6,7]
3243 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
3244 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload
3245 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,1,6,11,0,1,6,11,0,1,6,11,0,1,6,11]
3246 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm1
3247 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3248 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm9
3249 ; AVX1-ONLY-NEXT: vandnps %ymm9, %ymm12, %ymm9
3250 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm9, %ymm0
3251 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0
3252 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3253 ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm0
3254 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3255 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0
3256 ; AVX1-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1
3257 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3258 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm9
3259 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm9, %xmm0
3260 ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm10
3261 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm5
3262 ; AVX1-ONLY-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3263 ; AVX1-ONLY-NEXT: vmovdqa 256(%rdi), %xmm9
3264 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm9, %xmm6
3265 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5
3266 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7]
3267 ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm1
3268 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3269 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3
3270 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
3271 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
3272 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
3273 ; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm5
3274 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2
3275 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3276 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2
3277 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2
3278 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
3279 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3280 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12]
3281 ; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
3282 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm0
3283 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, %xmm11
3284 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3285 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128]
3286 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
3287 ; AVX1-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3288 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm4
3289 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm4
3290 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3291 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u],zero,zero,zero,zero,xmm7[4,9,14,u,u,u,u,u,u]
3292 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3293 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,0,5,10,15],zero,zero,zero,xmm1[u,u,u,u,u,u]
3294 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm8, %xmm8
3295 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[3,4,5,6,7,8,9,u,u,u,u,u,u]
3296 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm6
3297 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3298 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm14[u,u,u,u,u,u]
3299 ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm8, %xmm8
3300 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3,4],xmm4[5,6,7]
3301 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
3302 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,2,7,12,0,2,7,12,0,2,7,12,0,2,7,12]
3303 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3304 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm0, %xmm15
3305 ; AVX1-ONLY-NEXT: vandnps %ymm15, %ymm12, %ymm15
3306 ; AVX1-ONLY-NEXT: vorps %ymm15, %ymm8, %ymm8
3307 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm8, %ymm4
3308 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3309 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3310 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm2
3311 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3312 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm3
3313 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
3314 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u],zero,zero,zero,zero,xmm10[4,9,14,u,u,u,u,u,u]
3315 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,0,5,10,15],zero,zero,zero,xmm9[u,u,u,u,u,u]
3316 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm8
3317 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3318 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
3319 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u>
3320 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3
3321 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3322 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm10[u,u,u,u,u,u]
3323 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0
3324 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
3325 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
3326 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm3
3327 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3
3328 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2
3329 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
3330 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3331 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13]
3332 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
3333 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0
3334 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,1,6,11],zero,zero,zero,zero,xmm1[u,u,u,u,u,u]
3335 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [10,15,0,128,128,128,0,5,10,15,0,128,128,128,0,5]
3336 ; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
3337 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm5
3338 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3
3339 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3
3340 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <2,7,12,128,128,128,128,128,128,128,u,u,u,u,u,u>
3341 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm14
3342 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm14, %xmm3
3343 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm0[5,6,7]
3344 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3345 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
3346 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,4,9,14,0,128,128,128,128,4,9,14,0,128,128]
3347 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
3348 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload
3349 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm12
3350 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3,4,5,6,7]
3351 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,2,7,12,128,128,128,0,0]
3352 ; AVX1-ONLY-NEXT: # xmm14 = mem[0,0]
3353 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3354 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm0
3355 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [10,15,128,128,128,0,0,5,10,15,128,128,128,0,0,5]
3356 ; AVX1-ONLY-NEXT: # xmm9 = mem[0,0]
3357 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3358 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm13
3359 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm13[3,4,5,6,7]
3360 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm12, %xmm0
3361 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
3362 ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm12, %ymm0
3363 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3364 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13]
3365 ; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm12, %ymm13
3366 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0
3367 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
3368 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3369 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3370 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[3,8,13]
3371 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3372 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
3373 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0
3374 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,1,6,11],zero,zero,zero,zero,xmm8[u,u,u,u,u,u]
3375 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3376 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm4
3377 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm4, %xmm1
3378 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,4,5,6,7,8,9,u,u,u,u,u,u]
3379 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm10, %xmm4
3380 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1
3381 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
3382 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
3383 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm11[1,6,11,u,u,u,u,u,u,u,u,u,u]
3384 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3385 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm3
3386 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3,4,5,6,7]
3387 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3388 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm3
3389 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3390 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm14, %xmm2
3391 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7]
3392 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
3393 ; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1
3394 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3395 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13]
3396 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2
3397 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1
3398 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
3399 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3400 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
3401 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm5[u,u,u]
3402 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
3403 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[3,8,13],zero,zero,zero,xmm15[u,u,u,u,u,u,u,u,u,u]
3404 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,u],zero,zero,zero,xmm7[0,5,10,15,u,u,u]
3405 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5,6,7]
3406 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm3
3407 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3408 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[1,6,11,u,u,u,u,u,u,u]
3409 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3410 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,7,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
3411 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0
3412 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,4,5,6,7,8,u,u,u,u,u,u,u]
3413 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <3,8,13,128,128,128,128,128,128,u,u,u,u,u,u,u>
3414 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3415 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm1
3416 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0
3417 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,4,9,14,0,4,9,14,0,4,9,14,0,4,9,14]
3418 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3419 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4
3420 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm4
3421 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255]
3422 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3
3423 ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm12, %ymm4
3424 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3
3425 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3426 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm6[4,9,14]
3427 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3428 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
3429 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5
3430 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm7
3431 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
3432 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm7, %xmm5, %xmm5
3433 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
3434 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3435 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm11[2,7,12,u,u,u,u,u,u,u,u,u,u]
3436 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[u,u,u,u,u,u,1,6,11],zero,zero,zero,zero,xmm14[u,u,u]
3437 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5,6,7]
3438 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
3439 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u],zero,zero,zero,xmm8[0,5,10,15,u,u,u]
3440 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4,5,6,7]
3441 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3
3442 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u]
3443 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
3444 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u]
3445 ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5
3446 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u>
3447 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5
3448 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3449 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm2
3450 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm5, %xmm2
3451 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1
3452 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
3453 ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm2
3454 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm0
3455 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0
3456 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3457 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm9[4,9,14]
3458 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3459 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
3460 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1
3461 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2
3462 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
3463 ; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm2, %xmm1, %xmm1
3464 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
3465 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3466 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,1,6,11,128,128,128,128,0,1,6,11,128,128,128,128]
3467 ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0]
3468 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm2
3469 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,128,128,128,0,5,10,15,0,128,128,128,0,5,10,15]
3470 ; AVX1-ONLY-NEXT: # xmm1 = mem[0,0]
3471 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm3
3472 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
3473 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3474 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12,u,u,u,u,u,u,u]
3475 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,0,0,3,8,13,128,128,128,0,0,3,8,13,128,128]
3476 ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
3477 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3478 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6
3479 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4
3480 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4
3481 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm15[4,9,14],zero,zero,zero,zero,zero,zero,xmm15[u,u,u,u,u,u,u]
3482 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4
3483 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload
3484 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11,u,u,u,u]
3485 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,128,3,8,13,0,0,128,128,128,3,8,13,0,0]
3486 ; AVX1-ONLY-NEXT: # xmm7 = mem[0,0]
3487 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3488 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm12
3489 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5],xmm12[6,7]
3490 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [12,128,128,128,0,0,2,7,12,128,128,128,0,0,2,7]
3491 ; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
3492 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
3493 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm13
3494 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [4,9,14,128,128,128,0,0,4,9,14,128,128,128,0,0]
3495 ; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
3496 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3497 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15
3498 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7]
3499 ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10
3500 ; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm4, %xmm2, %xmm2
3501 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15]
3502 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3503 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm15, %xmm15
3504 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm15, %ymm4
3505 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7]
3506 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
3507 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm0
3508 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm9, %xmm1
3509 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0
3510 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3511 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[2,7,12,u,u,u,u,u,u,u]
3512 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm3
3513 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1
3514 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,4,5,6,7,8,u,u,u,u,u,u,u]
3515 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[4,9,14],zero,zero,zero,zero,zero,zero,xmm14[u,u,u,u,u,u,u]
3516 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1
3517 ; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm1, %xmm0, %xmm0
3518 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3519 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
3520 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3521 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4
3522 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5],xmm4[6,7]
3523 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3524 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm4, %xmm4
3525 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3526 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5
3527 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4,5],xmm5[6,7]
3528 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
3529 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3530 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm4, %xmm4
3531 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
3532 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
3533 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
3534 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3535 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rsi)
3536 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3537 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rsi)
3538 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3539 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rdx)
3540 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3541 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rdx)
3542 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3543 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%rcx)
3544 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3545 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%rcx)
3546 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3547 ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8)
3548 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3549 ; AVX1-ONLY-NEXT: vmovaps %ymm1, (%r8)
3550 ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%r9)
3551 ; AVX1-ONLY-NEXT: vmovaps %ymm2, (%r9)
3552 ; AVX1-ONLY-NEXT: addq $488, %rsp # imm = 0x1E8
3553 ; AVX1-ONLY-NEXT: vzeroupper
3554 ; AVX1-ONLY-NEXT: retq
3556 ; AVX2-ONLY-LABEL: load_i8_stride5_vf64:
3557 ; AVX2-ONLY: # %bb.0:
3558 ; AVX2-ONLY-NEXT: subq $136, %rsp
3559 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm2
3560 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm4
3561 ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm10
3562 ; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm9
3563 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255>
3564 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm10, %ymm9, %ymm0
3565 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3566 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255]
3567 ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1]
3568 ; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm15
3569 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0
3570 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3571 ; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm7
3572 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255>
3573 ; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
3574 ; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm5
3575 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3576 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0]
3577 ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1]
3578 ; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
3579 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3580 ; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
3581 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3582 ; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11
3583 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255>
3584 ; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
3585 ; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm5
3586 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3587 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255]
3588 ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1]
3589 ; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
3590 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3591 ; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
3592 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3593 ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3594 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3595 ; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
3596 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3597 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u>
3598 ; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
3599 ; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm5
3600 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3601 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0]
3602 ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1]
3603 ; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
3604 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3605 ; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
3606 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
3607 ; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
3608 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3609 ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm13
3610 ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm14
3611 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255>
3612 ; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm13, %ymm14, %ymm0
3613 ; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm6
3614 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1
3615 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u>
3616 ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1
3617 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u>
3618 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0
3619 ; AVX2-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm1
3620 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11]
3621 ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1]
3622 ; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm15, %ymm15
3623 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0]
3624 ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm1, %ymm15, %ymm0
3625 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
3626 ; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm7, %ymm7
3627 ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm3
3628 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1
3629 ; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm0
3630 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm8
3631 ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm5
3632 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0
3633 ; AVX2-ONLY-NEXT: vpor %xmm5, %xmm0, %xmm0
3634 ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm7, %ymm15
3635 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm13, %ymm14, %ymm0
3636 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <1,6,11,128,128,128,128,4,9,14,128,128,128,u,u,u>
3637 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm5
3638 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm0
3639 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u>
3640 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0
3641 ; AVX2-ONLY-NEXT: vpor %xmm5, %xmm0, %xmm0
3642 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12]
3643 ; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,0,1]
3644 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
3645 ; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm8, %ymm8
3646 ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm8, %ymm8
3647 ; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm11, %ymm0
3648 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm3, %ymm1, %ymm5
3649 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm6
3650 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm5
3651 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm4
3652 ; AVX2-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4
3653 ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm11
3654 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255>
3655 ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0
3656 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm4
3657 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u>
3658 ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm4
3659 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u>
3660 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0
3661 ; AVX2-ONLY-NEXT: vpor %xmm4, %xmm0, %xmm0
3662 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13]
3663 ; AVX2-ONLY-NEXT: # ymm4 = mem[0,1,0,1]
3664 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
3665 ; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm12, %ymm12
3666 ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0
3667 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3668 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3669 ; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm0, %ymm0
3670 ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm4
3671 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm4, %xmm12
3672 ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm5
3673 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4
3674 ; AVX2-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4
3675 ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
3676 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3677 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255>
3678 ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0
3679 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u>
3680 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm5
3681 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm0
3682 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u>
3683 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0
3684 ; AVX2-ONLY-NEXT: vpor %xmm5, %xmm0, %xmm0
3685 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14]
3686 ; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,0,1]
3687 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
3688 ; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm12, %ymm12
3689 ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm12, %ymm0
3690 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3691 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
3692 ; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm0, %ymm0
3693 ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm1, %ymm3, %ymm5
3694 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm4
3695 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm5
3696 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5
3697 ; AVX2-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4
3698 ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm6
3699 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255>
3700 ; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm10, %ymm9, %ymm0
3701 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3702 ; AVX2-ONLY-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
3703 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
3704 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255]
3705 ; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,0,1]
3706 ; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm10
3707 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1]
3708 ; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0
3709 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3710 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11]
3711 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u>
3712 ; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm14, %ymm13, %ymm13
3713 ; AVX2-ONLY-NEXT: vmovdqa 304(%rdi), %xmm2
3714 ; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm9
3715 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm0
3716 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128]
3717 ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %xmm1
3718 ; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm3
3719 ; AVX2-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0
3720 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
3721 ; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload
3722 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15]
3723 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
3724 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
3725 ; AVX2-ONLY-NEXT: vmovdqa 144(%rdi), %xmm3
3726 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm4
3727 ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm5
3728 ; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm12
3729 ; AVX2-ONLY-NEXT: vpor %xmm4, %xmm12, %xmm4
3730 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3731 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5,6,7],ymm15[8,9,10,11,12],ymm4[13,14,15]
3732 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm4[4,5,6,7]
3733 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3734 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,0,128,128,128,2,7,12,0,0,128,128,128,2,7,12]
3735 ; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm7
3736 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm14 = [0,0,3,8,13,128,128,128,0,0,3,8,13,128,128,128]
3737 ; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm15
3738 ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm15, %xmm7
3739 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
3740 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15]
3741 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7]
3742 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3743 ; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm8
3744 ; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm12
3745 ; AVX2-ONLY-NEXT: vpor %xmm8, %xmm12, %xmm8
3746 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
3747 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15]
3748 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
3749 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm12 = [0,128,128,128,128,4,9,14,0,128,128,128,128,4,9,14]
3750 ; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm11
3751 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm15 = [0,0,5,10,15,128,128,128,0,0,5,10,15,128,128,128]
3752 ; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm14
3753 ; AVX2-ONLY-NEXT: vpor %xmm11, %xmm14, %xmm11
3754 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
3755 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0]
3756 ; AVX2-ONLY-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
3757 ; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm12
3758 ; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm15
3759 ; AVX2-ONLY-NEXT: vpor %xmm12, %xmm15, %xmm12
3760 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
3761 ; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm6, %ymm12, %ymm12
3762 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm13, %xmm15
3763 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u>
3764 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15
3765 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u>
3766 ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm13, %xmm13
3767 ; AVX2-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13
3768 ; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15]
3769 ; AVX2-ONLY-NEXT: # ymm15 = mem[0,1,0,1]
3770 ; AVX2-ONLY-NEXT: vpshufb %ymm15, %ymm10, %ymm10
3771 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3,4,5,6,7]
3772 ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm13
3773 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15,0,1,6,11,0,5,10,15]
3774 ; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm13, %ymm13
3775 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,5,0,5,0,5,0,5]
3776 ; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm0, %ymm13
3777 ; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10
3778 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
3779 ; AVX2-ONLY-NEXT: vpshufb %ymm15, %ymm13, %ymm13
3780 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm15
3781 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm6
3782 ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm7
3783 ; AVX2-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6
3784 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7]
3785 ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm7
3786 ; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm7, %ymm4
3787 ; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm0, %ymm0
3788 ; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0
3789 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,3,8,13,0,0,128,128,128,3,8,13]
3790 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2
3791 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm6 = [0,0,4,9,14,128,128,128,0,0,4,9,14,128,128,128]
3792 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1
3793 ; AVX2-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1
3794 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3795 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3796 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15]
3797 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
3798 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm2
3799 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm3
3800 ; AVX2-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
3801 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
3802 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3803 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15]
3804 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
3805 ; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload
3806 ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi)
3807 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3808 ; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi)
3809 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
3810 ; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx)
3811 ; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%rdx)
3812 ; AVX2-ONLY-NEXT: vmovdqa %ymm1, 32(%rcx)
3813 ; AVX2-ONLY-NEXT: vmovdqa %ymm2, (%rcx)
3814 ; AVX2-ONLY-NEXT: vmovdqa %ymm11, 32(%r8)
3815 ; AVX2-ONLY-NEXT: vmovdqa %ymm12, (%r8)
3816 ; AVX2-ONLY-NEXT: vmovdqa %ymm10, 32(%r9)
3817 ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%r9)
3818 ; AVX2-ONLY-NEXT: addq $136, %rsp
3819 ; AVX2-ONLY-NEXT: vzeroupper
3820 ; AVX2-ONLY-NEXT: retq
3822 ; AVX512F-LABEL: load_i8_stride5_vf64:
3823 ; AVX512F: # %bb.0:
3824 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535]
3825 ; AVX512F-NEXT: vmovdqa64 (%rdi), %ymm23
3826 ; AVX512F-NEXT: vmovdqa64 32(%rdi), %ymm24
3827 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm21
3828 ; AVX512F-NEXT: vmovdqa64 96(%rdi), %ymm22
3829 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm5
3830 ; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm23, %ymm5
3831 ; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm6
3832 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
3833 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
3834 ; AVX512F-NEXT: vpor %xmm6, %xmm5, %xmm6
3835 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535]
3836 ; AVX512F-NEXT: vmovdqa %ymm5, %ymm7
3837 ; AVX512F-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm7
3838 ; AVX512F-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
3839 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm7, %ymm8
3840 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,6,11,16,21,26,31,20,25,30,19,24,29],zero,zero,zero,zero,zero,zero
3841 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
3842 ; AVX512F-NEXT: vpternlogq $248, %ymm18, %ymm6, %ymm9
3843 ; AVX512F-NEXT: vmovdqa 192(%rdi), %ymm6
3844 ; AVX512F-NEXT: vmovdqa 224(%rdi), %ymm7
3845 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm10
3846 ; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm7, %ymm10
3847 ; AVX512F-NEXT: vmovdqa 208(%rdi), %xmm8
3848 ; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm10
3849 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,ymm10[3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
3850 ; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm10
3851 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
3852 ; AVX512F-NEXT: vmovdqa 160(%rdi), %xmm11
3853 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
3854 ; AVX512F-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
3855 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3856 ; AVX512F-NEXT: vpternlogq $186, %ymm12, %ymm16, %ymm1
3857 ; AVX512F-NEXT: vmovdqa 144(%rdi), %xmm13
3858 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm13[1,6,11]
3859 ; AVX512F-NEXT: vmovdqa 128(%rdi), %xmm14
3860 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
3861 ; AVX512F-NEXT: vpor %xmm12, %xmm15, %xmm12
3862 ; AVX512F-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
3863 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm12, %zmm17
3864 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
3865 ; AVX512F-NEXT: vpternlogq $184, %zmm9, %zmm20, %zmm17
3866 ; AVX512F-NEXT: vmovdqa 256(%rdi), %ymm15
3867 ; AVX512F-NEXT: vmovdqa 288(%rdi), %ymm12
3868 ; AVX512F-NEXT: vmovdqa %ymm5, %ymm9
3869 ; AVX512F-NEXT: vpternlogq $202, %ymm15, %ymm12, %ymm9
3870 ; AVX512F-NEXT: vextracti128 $1, %ymm9, %xmm2
3871 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u],zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[1,6,11]
3872 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,4,9,14],zero,zero,zero,xmm9[2,7,12],zero,zero,zero
3873 ; AVX512F-NEXT: vpor %xmm2, %xmm9, %xmm2
3874 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
3875 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
3876 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm17, %zmm19
3877 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
3878 ; AVX512F-NEXT: vpternlogq $202, %ymm12, %ymm15, %ymm1
3879 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
3880 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12]
3881 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,0,5,10,15],zero,zero,zero,xmm1[3,8,13],zero,zero,zero
3882 ; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1
3883 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3884 ; AVX512F-NEXT: vmovdqa %ymm5, %ymm2
3885 ; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm7, %ymm2
3886 ; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm2
3887 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,ymm2[4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
3888 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[1,6,11],zero,zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u]
3889 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm10[0,5,10,15,u,u,u,u,u,u,u,u,u]
3890 ; AVX512F-NEXT: vpor %xmm3, %xmm9, %xmm3
3891 ; AVX512F-NEXT: vpternlogq $186, %ymm2, %ymm16, %ymm3
3892 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0]
3893 ; AVX512F-NEXT: vpternlogq $226, %ymm1, %ymm16, %ymm3
3894 ; AVX512F-NEXT: vmovdqa %ymm5, %ymm1
3895 ; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm23, %ymm1
3896 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero,xmm1[u,u,u]
3897 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
3898 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13,u,u,u]
3899 ; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1
3900 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,0,65535,65535,0,65535,0,65535,65535,0,65535,0,65535,65535]
3901 ; AVX512F-NEXT: vmovdqa %ymm9, %ymm2
3902 ; AVX512F-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm2
3903 ; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
3904 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm4
3905 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,7,12,17,22,27,16,21,26,31,20,25,30],zero,zero,zero,zero,zero,zero
3906 ; AVX512F-NEXT: vpternlogq $248, %ymm18, %ymm1, %ymm2
3907 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm13[2,7,12]
3908 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm14[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
3909 ; AVX512F-NEXT: vpor %xmm1, %xmm4, %xmm1
3910 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3911 ; AVX512F-NEXT: vpternlogq $184, %zmm2, %zmm20, %zmm1
3912 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm17
3913 ; AVX512F-NEXT: vmovdqa %ymm5, %ymm1
3914 ; AVX512F-NEXT: vpternlogq $202, %ymm12, %ymm15, %ymm1
3915 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,1,6,11],zero,zero,zero,zero,xmm1[4,9,14],zero,zero,zero
3916 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
3917 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[3,8,13]
3918 ; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1
3919 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3920 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm2
3921 ; AVX512F-NEXT: vpternlogq $202, %ymm7, %ymm6, %ymm2
3922 ; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm2
3923 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
3924 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm10[1,6,11,u,u,u,u,u,u,u,u,u,u]
3925 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm11[2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
3926 ; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3
3927 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7]
3928 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
3929 ; AVX512F-NEXT: vpternlogq $226, %ymm1, %ymm16, %ymm2
3930 ; AVX512F-NEXT: vmovdqa %ymm9, %ymm1
3931 ; AVX512F-NEXT: vpternlogq $202, %ymm23, %ymm24, %ymm1
3932 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3
3933 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[1,6,11],zero,zero,zero,zero,xmm3[4,9,14,u,u,u]
3934 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero,xmm1[u,u,u]
3935 ; AVX512F-NEXT: vpor %xmm3, %xmm1, %xmm1
3936 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm3
3937 ; AVX512F-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm3
3938 ; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
3939 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4
3940 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[3,8,13,18,23,28,17,22,27,16,21,26,31],zero,zero,zero,zero,zero,zero
3941 ; AVX512F-NEXT: vpternlogq $248, %ymm18, %ymm1, %ymm3
3942 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm13[3,8,13]
3943 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm14[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
3944 ; AVX512F-NEXT: vpor %xmm1, %xmm4, %xmm1
3945 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3946 ; AVX512F-NEXT: vpternlogq $184, %zmm3, %zmm20, %zmm1
3947 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm18
3948 ; AVX512F-NEXT: vmovdqa %ymm9, %ymm1
3949 ; AVX512F-NEXT: vpternlogq $202, %ymm15, %ymm12, %ymm1
3950 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
3951 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,zero,xmm2[1,6,11],zero,zero,zero,zero,xmm2[4,9,14]
3952 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,7,12],zero,zero,zero,xmm1[0,5,10,15],zero,zero,zero
3953 ; AVX512F-NEXT: vpor %xmm2, %xmm1, %xmm1
3954 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3955 ; AVX512F-NEXT: vmovdqa %ymm5, %ymm2
3956 ; AVX512F-NEXT: vpternlogq $202, %ymm7, %ymm6, %ymm2
3957 ; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm2
3958 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
3959 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm10[2,7,12,u,u,u,u,u,u,u,u,u,u]
3960 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm11[3,8,13],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
3961 ; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3
3962 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7]
3963 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
3964 ; AVX512F-NEXT: vpternlogq $226, %ymm1, %ymm16, %ymm2
3965 ; AVX512F-NEXT: vmovdqa %ymm0, %ymm1
3966 ; AVX512F-NEXT: vpternlogq $202, %ymm23, %ymm24, %ymm1
3967 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[u,u,u]
3968 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
3969 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[2,7,12],zero,zero,zero,xmm1[0,5,10,15,u,u,u]
3970 ; AVX512F-NEXT: vpor %xmm3, %xmm1, %xmm1
3971 ; AVX512F-NEXT: vmovdqa %ymm5, %ymm3
3972 ; AVX512F-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm3
3973 ; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
3974 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4
3975 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
3976 ; AVX512F-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3
3977 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm13[4,9,14]
3978 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm14[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
3979 ; AVX512F-NEXT: vpor %xmm1, %xmm4, %xmm1
3980 ; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3981 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
3982 ; AVX512F-NEXT: vpternlogq $226, %zmm3, %zmm4, %zmm1
3983 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
3984 ; AVX512F-NEXT: vpternlogq $226, %ymm15, %ymm0, %ymm12
3985 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero
3986 ; AVX512F-NEXT: vextracti128 $1, %ymm12, %xmm3
3987 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
3988 ; AVX512F-NEXT: vpor %xmm2, %xmm3, %xmm2
3989 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
3990 ; AVX512F-NEXT: vpternlogq $202, %ymm6, %ymm7, %ymm9
3991 ; AVX512F-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm8, %ymm9
3992 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
3993 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm10[3,8,13,u,u,u,u,u,u,u,u,u,u]
3994 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[4,9,14],zero,zero,zero,xmm11[u,u,u,u,u,u,u,u,u,u]
3995 ; AVX512F-NEXT: vpor %xmm6, %xmm7, %xmm6
3996 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm3[3,4,5,6,7]
3997 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
3998 ; AVX512F-NEXT: vpternlogq $226, %ymm2, %ymm16, %ymm3
3999 ; AVX512F-NEXT: vpternlogq $202, %ymm23, %ymm24, %ymm5
4000 ; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm2
4001 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[3,8,13],zero,zero,zero,xmm2[1,6,11,u,u,u,u]
4002 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,9,14],zero,zero,zero,xmm5[2,7,12],zero,zero,zero,xmm5[u,u,u,u]
4003 ; AVX512F-NEXT: vpor %xmm2, %xmm5, %xmm2
4004 ; AVX512F-NEXT: vpternlogq $202, %ymm22, %ymm21, %ymm0
4005 ; AVX512F-NEXT: vpermq {{.*#+}} ymm5 = ymm0[2,3,0,1]
4006 ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm5
4007 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
4008 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
4009 ; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm2
4010 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
4011 ; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,5,0,5,0,5,0,5]
4012 ; AVX512F-NEXT: vpermd %ymm2, %ymm5, %ymm2
4013 ; AVX512F-NEXT: vpternlogq $226, %zmm0, %zmm4, %zmm2
4014 ; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
4015 ; AVX512F-NEXT: vmovdqa64 %zmm19, (%rsi)
4016 ; AVX512F-NEXT: vmovdqa64 %zmm17, (%rdx)
4017 ; AVX512F-NEXT: vmovdqa64 %zmm18, (%rcx)
4018 ; AVX512F-NEXT: vmovdqa64 %zmm1, (%r8)
4019 ; AVX512F-NEXT: vmovdqa64 %zmm0, (%r9)
4020 ; AVX512F-NEXT: vzeroupper
4021 ; AVX512F-NEXT: retq
4023 ; AVX512BW-LABEL: load_i8_stride5_vf64:
4024 ; AVX512BW: # %bb.0:
4025 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm3
4026 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
4027 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm0
4028 ; AVX512BW-NEXT: vmovdqa 96(%rdi), %ymm1
4029 ; AVX512BW-NEXT: movw $21140, %ax # imm = 0x5294
4030 ; AVX512BW-NEXT: kmovd %eax, %k2
4031 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm0, %ymm4 {%k2}
4032 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
4033 ; AVX512BW-NEXT: movl $1108344832, %eax # imm = 0x42100000
4034 ; AVX512BW-NEXT: kmovd %eax, %k1
4035 ; AVX512BW-NEXT: vmovdqu8 %ymm5, %ymm4 {%k1}
4036 ; AVX512BW-NEXT: movw $19026, %ax # imm = 0x4A52
4037 ; AVX512BW-NEXT: kmovd %eax, %k1
4038 ; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k1}
4039 ; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm6
4040 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
4041 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
4042 ; AVX512BW-NEXT: vpor %xmm6, %xmm5, %xmm10
4043 ; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000
4044 ; AVX512BW-NEXT: kmovd %eax, %k5
4045 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
4046 ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm5
4047 ; AVX512BW-NEXT: vmovdqa 224(%rdi), %ymm4
4048 ; AVX512BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1}
4049 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
4050 ; AVX512BW-NEXT: movl $4228, %eax # imm = 0x1084
4051 ; AVX512BW-NEXT: kmovd %eax, %k3
4052 ; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k3}
4053 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,3,8,13,2,7,12,1,6,11,16,21,26,31,u,u,u,u,u,u,u,u,u,u,u,u]
4054 ; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm6
4055 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,u,u,u,u,u,u,4,9,14,u,u,u,u,u]
4056 ; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm7
4057 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,0,5,10,15,u,u,u,u]
4058 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
4059 ; AVX512BW-NEXT: movl $127, %eax
4060 ; AVX512BW-NEXT: kmovd %eax, %k4
4061 ; AVX512BW-NEXT: vmovdqu8 %ymm9, %ymm8 {%k4}
4062 ; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm12
4063 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[1,6,11]
4064 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm13
4065 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
4066 ; AVX512BW-NEXT: vpor %xmm9, %xmm11, %xmm9
4067 ; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
4068 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8
4069 ; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm10 {%k5}
4070 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm10, %ymm11
4071 ; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm9
4072 ; AVX512BW-NEXT: vmovdqa 288(%rdi), %ymm8
4073 ; AVX512BW-NEXT: vpblendmw %ymm9, %ymm8, %ymm14 {%k2}
4074 ; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm15
4075 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u],zero,zero,zero,xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11]
4076 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,9,14],zero,zero,zero,xmm14[2,7,12],zero,zero,zero
4077 ; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14
4078 ; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
4079 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
4080 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm18
4081 ; AVX512BW-NEXT: movw $10570, %ax # imm = 0x294A
4082 ; AVX512BW-NEXT: kmovd %eax, %k3
4083 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm11 {%k3}
4084 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1]
4085 ; AVX512BW-NEXT: movl $-2078212096, %eax # imm = 0x84210000
4086 ; AVX512BW-NEXT: kmovd %eax, %k6
4087 ; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm11 {%k6}
4088 ; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k2}
4089 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero,xmm14[u,u,u]
4090 ; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm14
4091 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13,u,u,u]
4092 ; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14
4093 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
4094 ; AVX512BW-NEXT: vpblendmw %ymm5, %ymm4, %ymm11 {%k2}
4095 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm11[2,3,0,1]
4096 ; AVX512BW-NEXT: movl $8456, %eax # imm = 0x2108
4097 ; AVX512BW-NEXT: kmovd %eax, %k6
4098 ; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k6}
4099 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
4100 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u]
4101 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[0,5,10,15,u,u,u,u,u,u,u,u,u]
4102 ; AVX512BW-NEXT: vporq %xmm15, %xmm16, %xmm15
4103 ; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
4104 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[2,7,12]
4105 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
4106 ; AVX512BW-NEXT: vporq %xmm15, %xmm16, %xmm15
4107 ; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
4108 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm15, %zmm11
4109 ; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm14 {%k5}
4110 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm14, %ymm11
4111 ; AVX512BW-NEXT: vpblendmw %ymm8, %ymm9, %ymm15 {%k1}
4112 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16
4113 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,zero,zero,xmm16[4,9,14],zero,zero,zero,xmm16[2,7,12]
4114 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,0,5,10,15],zero,zero,zero,xmm15[3,8,13],zero,zero,zero
4115 ; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15
4116 ; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
4117 ; AVX512BW-NEXT: movl $-524288, %eax # imm = 0xFFF80000
4118 ; AVX512BW-NEXT: kmovd %eax, %k4
4119 ; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm11 {%k4}
4120 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm19
4121 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm14 {%k1}
4122 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
4123 ; AVX512BW-NEXT: movl $138543104, %eax # imm = 0x8420000
4124 ; AVX512BW-NEXT: kmovd %eax, %k6
4125 ; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm14 {%k6}
4126 ; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k3}
4127 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16
4128 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm16[1,6,11],zero,zero,zero,zero,xmm16[4,9,14,u,u,u]
4129 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15],zero,zero,zero,xmm15[u,u,u]
4130 ; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15
4131 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
4132 ; AVX512BW-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1}
4133 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm14[2,3,0,1]
4134 ; AVX512BW-NEXT: movl $16912, %eax # imm = 0x4210
4135 ; AVX512BW-NEXT: kmovd %eax, %k6
4136 ; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm14 {%k6}
4137 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
4138 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[1,6,11,u,u,u,u,u,u,u,u,u,u]
4139 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[2,7,12],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
4140 ; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm10
4141 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm14[3,4,5,6,7]
4142 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm14[4,5,6,7]
4143 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm12[3,8,13]
4144 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
4145 ; AVX512BW-NEXT: vporq %xmm14, %xmm16, %xmm14
4146 ; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
4147 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm10
4148 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm15 {%k5}
4149 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm15, %ymm10
4150 ; AVX512BW-NEXT: vpblendmw %ymm8, %ymm9, %ymm14 {%k2}
4151 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm14[u,u,u,1,6,11],zero,zero,zero,zero,xmm14[4,9,14],zero,zero,zero
4152 ; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm14
4153 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,zero,xmm14[0,5,10,15],zero,zero,zero,xmm14[3,8,13]
4154 ; AVX512BW-NEXT: vporq %xmm16, %xmm14, %xmm14
4155 ; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
4156 ; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm10 {%k4}
4157 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm15, %zmm14
4158 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k2}
4159 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm10[2,3,0,1]
4160 ; AVX512BW-NEXT: movl $277086208, %eax # imm = 0x10840000
4161 ; AVX512BW-NEXT: kmovd %eax, %k5
4162 ; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm10 {%k5}
4163 ; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm15 {%k1}
4164 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[3,8,13],zero,zero,zero,xmm15[1,6,11],zero,zero,zero,zero,xmm15[u,u,u]
4165 ; AVX512BW-NEXT: vextracti128 $1, %ymm15, %xmm15
4166 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[2,7,12],zero,zero,zero,xmm15[0,5,10,15,u,u,u]
4167 ; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15
4168 ; AVX512BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000
4169 ; AVX512BW-NEXT: kmovd %eax, %k5
4170 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
4171 ; AVX512BW-NEXT: vpblendmw %ymm4, %ymm5, %ymm10 {%k2}
4172 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm10[2,3,0,1]
4173 ; AVX512BW-NEXT: movl $33825, %eax # imm = 0x8421
4174 ; AVX512BW-NEXT: kmovd %eax, %k5
4175 ; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm10 {%k5}
4176 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
4177 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = zero,zero,zero,xmm6[2,7,12,u,u,u,u,u,u,u,u,u,u]
4178 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm17 = xmm7[3,8,13],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
4179 ; AVX512BW-NEXT: vporq %xmm16, %xmm17, %xmm11
4180 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
4181 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
4182 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm12[4,9,14]
4183 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
4184 ; AVX512BW-NEXT: vpor %xmm11, %xmm12, %xmm11
4185 ; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
4186 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
4187 ; AVX512BW-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF
4188 ; AVX512BW-NEXT: kmovq %rax, %k5
4189 ; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm10 {%k5}
4190 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm10, %ymm11
4191 ; AVX512BW-NEXT: vpblendmw %ymm9, %ymm8, %ymm12 {%k3}
4192 ; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm13
4193 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,zero,xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14]
4194 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,2,7,12],zero,zero,zero,xmm12[0,5,10,15],zero,zero,zero
4195 ; AVX512BW-NEXT: vpor %xmm13, %xmm12, %xmm12
4196 ; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
4197 ; AVX512BW-NEXT: vmovdqu8 %ymm12, %ymm11 {%k4}
4198 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10
4199 ; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2}
4200 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
4201 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[3,8,13],zero,zero,zero,xmm3[1,6,11,u,u,u,u]
4202 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,9,14],zero,zero,zero,xmm2[2,7,12],zero,zero,zero,xmm2[u,u,u,u]
4203 ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
4204 ; AVX512BW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
4205 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
4206 ; AVX512BW-NEXT: movl $554172416, %eax # imm = 0x21080000
4207 ; AVX512BW-NEXT: kmovd %eax, %k2
4208 ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k2}
4209 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u]
4210 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
4211 ; AVX512BW-NEXT: vmovdqu16 %ymm5, %ymm4 {%k3}
4212 ; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm4[2,3,0,1]
4213 ; AVX512BW-NEXT: movl $2114, %eax # imm = 0x842
4214 ; AVX512BW-NEXT: kmovd %eax, %k2
4215 ; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm4 {%k2}
4216 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
4217 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm6[3,8,13,u,u,u,u,u,u,u,u,u,u]
4218 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[4,9,14],zero,zero,zero,xmm7[u,u,u,u,u,u,u,u,u,u]
4219 ; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2
4220 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
4221 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
4222 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2
4223 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u]
4224 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,5,0,5,0,5,0,5]
4225 ; AVX512BW-NEXT: vpermd %ymm2, %ymm3, %ymm2
4226 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
4227 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k5}
4228 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm0
4229 ; AVX512BW-NEXT: vmovdqu16 %ymm9, %ymm8 {%k1}
4230 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u,3,8,13],zero,zero,zero,xmm8[1,6,11],zero,zero,zero,zero
4231 ; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm3
4232 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
4233 ; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2
4234 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4235 ; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm0 {%k4}
4236 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
4237 ; AVX512BW-NEXT: vmovdqa64 %zmm18, (%rsi)
4238 ; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rdx)
4239 ; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rcx)
4240 ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%r8)
4241 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r9)
4242 ; AVX512BW-NEXT: vzeroupper
4243 ; AVX512BW-NEXT: retq
4244 %wide.vec = load <320 x i8>, ptr %in.vec, align 64
4245 %strided.vec0 = shufflevector <320 x i8> %wide.vec, <320 x i8> poison, <64 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75, i32 80, i32 85, i32 90, i32 95, i32 100, i32 105, i32 110, i32 115, i32 120, i32 125, i32 130, i32 135, i32 140, i32 145, i32 150, i32 155, i32 160, i32 165, i32 170, i32 175, i32 180, i32 185, i32 190, i32 195, i32 200, i32 205, i32 210, i32 215, i32 220, i32 225, i32 230, i32 235, i32 240, i32 245, i32 250, i32 255, i32 260, i32 265, i32 270, i32 275, i32 280, i32 285, i32 290, i32 295, i32 300, i32 305, i32 310, i32 315>
4246 %strided.vec1 = shufflevector <320 x i8> %wide.vec, <320 x i8> poison, <64 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36, i32 41, i32 46, i32 51, i32 56, i32 61, i32 66, i32 71, i32 76, i32 81, i32 86, i32 91, i32 96, i32 101, i32 106, i32 111, i32 116, i32 121, i32 126, i32 131, i32 136, i32 141, i32 146, i32 151, i32 156, i32 161, i32 166, i32 171, i32 176, i32 181, i32 186, i32 191, i32 196, i32 201, i32 206, i32 211, i32 216, i32 221, i32 226, i32 231, i32 236, i32 241, i32 246, i32 251, i32 256, i32 261, i32 266, i32 271, i32 276, i32 281, i32 286, i32 291, i32 296, i32 301, i32 306, i32 311, i32 316>
4247 %strided.vec2 = shufflevector <320 x i8> %wide.vec, <320 x i8> poison, <64 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37, i32 42, i32 47, i32 52, i32 57, i32 62, i32 67, i32 72, i32 77, i32 82, i32 87, i32 92, i32 97, i32 102, i32 107, i32 112, i32 117, i32 122, i32 127, i32 132, i32 137, i32 142, i32 147, i32 152, i32 157, i32 162, i32 167, i32 172, i32 177, i32 182, i32 187, i32 192, i32 197, i32 202, i32 207, i32 212, i32 217, i32 222, i32 227, i32 232, i32 237, i32 242, i32 247, i32 252, i32 257, i32 262, i32 267, i32 272, i32 277, i32 282, i32 287, i32 292, i32 297, i32 302, i32 307, i32 312, i32 317>
4248 %strided.vec3 = shufflevector <320 x i8> %wide.vec, <320 x i8> poison, <64 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38, i32 43, i32 48, i32 53, i32 58, i32 63, i32 68, i32 73, i32 78, i32 83, i32 88, i32 93, i32 98, i32 103, i32 108, i32 113, i32 118, i32 123, i32 128, i32 133, i32 138, i32 143, i32 148, i32 153, i32 158, i32 163, i32 168, i32 173, i32 178, i32 183, i32 188, i32 193, i32 198, i32 203, i32 208, i32 213, i32 218, i32 223, i32 228, i32 233, i32 238, i32 243, i32 248, i32 253, i32 258, i32 263, i32 268, i32 273, i32 278, i32 283, i32 288, i32 293, i32 298, i32 303, i32 308, i32 313, i32 318>
4249 %strided.vec4 = shufflevector <320 x i8> %wide.vec, <320 x i8> poison, <64 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39, i32 44, i32 49, i32 54, i32 59, i32 64, i32 69, i32 74, i32 79, i32 84, i32 89, i32 94, i32 99, i32 104, i32 109, i32 114, i32 119, i32 124, i32 129, i32 134, i32 139, i32 144, i32 149, i32 154, i32 159, i32 164, i32 169, i32 174, i32 179, i32 184, i32 189, i32 194, i32 199, i32 204, i32 209, i32 214, i32 219, i32 224, i32 229, i32 234, i32 239, i32 244, i32 249, i32 254, i32 259, i32 264, i32 269, i32 274, i32 279, i32 284, i32 289, i32 294, i32 299, i32 304, i32 309, i32 314, i32 319>
4250 store <64 x i8> %strided.vec0, ptr %out.vec0, align 64
4251 store <64 x i8> %strided.vec1, ptr %out.vec1, align 64
4252 store <64 x i8> %strided.vec2, ptr %out.vec2, align 64
4253 store <64 x i8> %strided.vec3, ptr %out.vec3, align 64
4254 store <64 x i8> %strided.vec4, ptr %out.vec4, align 64
4255 ret void
4256 }
4257 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
4260 ; AVX2-FAST-PERLANE: {{.*}}
4263 ; AVX512-FAST: {{.*}}
4264 ; AVX512-SLOW: {{.*}}
4265 ; AVX512BW-FAST: {{.*}}
4266 ; AVX512BW-ONLY-FAST: {{.*}}
4267 ; AVX512BW-ONLY-SLOW: {{.*}}
4268 ; AVX512BW-SLOW: {{.*}}
4269 ; AVX512DQ-FAST: {{.*}}
4270 ; AVX512DQ-SLOW: {{.*}}
4271 ; AVX512DQBW-FAST: {{.*}}
4272 ; AVX512DQBW-SLOW: {{.*}}
4273 ; AVX512F-FAST: {{.*}}
4274 ; AVX512F-ONLY-FAST: {{.*}}
4275 ; AVX512F-ONLY-SLOW: {{.*}}
4276 ; AVX512F-SLOW: {{.*}}
4279 ; FALLBACK10: {{.*}}
4280 ; FALLBACK11: {{.*}}
4281 ; FALLBACK12: {{.*}}