; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved stores.
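;
; As a rough illustration (not part of this test), a scalar loop of the following shape
; is the kind of source the LoopVectorizer turns into the wide shufflevector-plus-store
; IR exercised below; the struct layout and all names here are hypothetical:
;
;   /* seven i16 fields per element gives the stride of 7 */
;   struct S7 { short f0, f1, f2, f3, f4, f5, f6; };
;
;   void store7(struct S7 *out, const short *in0, const short *in1, const short *in2,
;               const short *in3, const short *in4, const short *in5, const short *in6,
;               int n) {
;     for (int i = 0; i < n; ++i) {
;       out[i].f0 = in0[i]; out[i].f1 = in1[i]; out[i].f2 = in2[i]; out[i].f3 = in3[i];
;       out[i].f4 = in4[i]; out[i].f5 = in5[i]; out[i].f6 = in6[i];
;     }
;   }
;
; After interleaved-store vectorization, the seven per-field stores become a single wide
; store of a shufflevector that interleaves the loaded vectors, which is the pattern each
; @store_i16_stride7_vf* function below checks at a given vector width.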

define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind {
; SSE-LABEL: store_i16_stride7_vf2:
; SSE: # %bb.0:
21 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
22 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
23 ; SSE-NEXT: movdqa (%rdi), %xmm0
24 ; SSE-NEXT: movdqa (%rdx), %xmm1
25 ; SSE-NEXT: movdqa (%r8), %xmm2
26 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
27 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
28 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
29 ; SSE-NEXT: movdqa %xmm0, %xmm3
30 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
31 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,1]
32 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,1,3,4,5,6,7]
33 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
34 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,0,0,65535]
35 ; SSE-NEXT: pand %xmm4, %xmm3
36 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
37 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,2,0]
38 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,7,7]
39 ; SSE-NEXT: pandn %xmm5, %xmm4
40 ; SSE-NEXT: por %xmm3, %xmm4
41 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
42 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
43 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,0,3,4,5,6,7]
44 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535]
45 ; SSE-NEXT: pand %xmm1, %xmm0
46 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
47 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
48 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
49 ; SSE-NEXT: pandn %xmm2, %xmm1
50 ; SSE-NEXT: por %xmm0, %xmm1
51 ; SSE-NEXT: movq %xmm1, 16(%rax)
52 ; SSE-NEXT: movdqa %xmm4, (%rax)
53 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: movd %xmm0, 24(%rax)
; SSE-NEXT: retq
; AVX-LABEL: store_i16_stride7_vf2:
; AVX: # %bb.0:
59 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
60 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
61 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
62 ; AVX-NEXT: vmovdqa (%rdx), %xmm1
63 ; AVX-NEXT: vmovdqa (%r8), %xmm2
64 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
65 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
66 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
67 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
68 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0]
69 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,0,1,4,5,8,9,2,3]
70 ; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm3
71 ; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm2
72 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6],xmm3[7]
73 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,10,11,14,15,2,3,6,7,10,11,12,13,14,15]
74 ; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
75 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
76 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5],xmm0[6,7]
77 ; AVX-NEXT: vpextrd $2, %xmm1, 24(%rax)
78 ; AVX-NEXT: vmovq %xmm0, 16(%rax)
; AVX-NEXT: vmovdqa %xmm2, (%rax)
; AVX-NEXT: retq
; AVX2-LABEL: store_i16_stride7_vf2:
; AVX2: # %bb.0:
84 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
85 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
86 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
87 ; AVX2-NEXT: vmovdqa (%rsi), %xmm1
88 ; AVX2-NEXT: vmovdqa (%rdx), %xmm2
89 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
90 ; AVX2-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
91 ; AVX2-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
92 ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
93 ; AVX2-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
94 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
95 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
96 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
97 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,ymm0[22,23,26,27,30,31],zero,zero,zero,zero,zero,zero,ymm0[24,25,20,21]
98 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
99 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
100 ; AVX2-NEXT: vpextrd $2, %xmm1, 24(%rax)
101 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
102 ; AVX2-NEXT: vmovq %xmm1, 16(%rax)
103 ; AVX2-NEXT: vmovdqa %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX2-FP-LABEL: store_i16_stride7_vf2:
; AVX2-FP: # %bb.0:
109 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
110 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
111 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
112 ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1
113 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2
114 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
115 ; AVX2-FP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
116 ; AVX2-FP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
117 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
118 ; AVX2-FP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
119 ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
120 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
121 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
122 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,ymm0[22,23,26,27,30,31],zero,zero,zero,zero,zero,zero,ymm0[24,25,20,21]
123 ; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0
124 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1
125 ; AVX2-FP-NEXT: vpextrd $2, %xmm1, 24(%rax)
126 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
127 ; AVX2-FP-NEXT: vmovq %xmm1, 16(%rax)
128 ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rax)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
; AVX2-FCP-LABEL: store_i16_stride7_vf2:
; AVX2-FCP: # %bb.0:
134 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
135 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
136 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
137 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1
138 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2
139 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
140 ; AVX2-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
141 ; AVX2-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
142 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
143 ; AVX2-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
144 ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
145 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
146 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
147 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,ymm0[22,23,26,27,30,31],zero,zero,zero,zero,zero,zero,ymm0[24,25,20,21]
148 ; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
149 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
150 ; AVX2-FCP-NEXT: vpextrd $2, %xmm1, 24(%rax)
151 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
152 ; AVX2-FCP-NEXT: vmovq %xmm1, 16(%rax)
153 ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rax)
154 ; AVX2-FCP-NEXT: vzeroupper
155 ; AVX2-FCP-NEXT: retq
; AVX512-LABEL: store_i16_stride7_vf2:
; AVX512: # %bb.0:
159 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
160 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
161 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
162 ; AVX512-NEXT: vmovdqa (%rsi), %xmm1
163 ; AVX512-NEXT: vmovdqa (%rdx), %xmm2
164 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
165 ; AVX512-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
166 ; AVX512-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
167 ; AVX512-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
168 ; AVX512-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
169 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
170 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u]
171 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
172 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,ymm0[22,23,26,27,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
173 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
174 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
175 ; AVX512-NEXT: vpextrd $2, %xmm1, 24(%rax)
176 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
177 ; AVX512-NEXT: vmovq %xmm1, 16(%rax)
178 ; AVX512-NEXT: vmovdqa %xmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
182 ; AVX512-FCP-LABEL: store_i16_stride7_vf2:
183 ; AVX512-FCP: # %bb.0:
184 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
185 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
186 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
187 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1
188 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2
189 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
190 ; AVX512-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
191 ; AVX512-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
192 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
193 ; AVX512-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
194 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
195 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u]
196 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
197 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,ymm0[22,23,26,27,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
198 ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
199 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
200 ; AVX512-FCP-NEXT: vpextrd $2, %xmm1, 24(%rax)
201 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
202 ; AVX512-FCP-NEXT: vmovq %xmm1, 16(%rax)
203 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rax)
204 ; AVX512-FCP-NEXT: vzeroupper
205 ; AVX512-FCP-NEXT: retq
; AVX512DQ-LABEL: store_i16_stride7_vf2:
; AVX512DQ: # %bb.0:
209 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
210 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
211 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
212 ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1
213 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2
214 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
215 ; AVX512DQ-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
216 ; AVX512DQ-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
217 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
218 ; AVX512DQ-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
219 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
220 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u]
221 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
222 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,ymm0[22,23,26,27,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
223 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
224 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1
225 ; AVX512DQ-NEXT: vpextrd $2, %xmm1, 24(%rax)
226 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
227 ; AVX512DQ-NEXT: vmovq %xmm1, 16(%rax)
228 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rax)
229 ; AVX512DQ-NEXT: vzeroupper
230 ; AVX512DQ-NEXT: retq
232 ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf2:
233 ; AVX512DQ-FCP: # %bb.0:
234 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
235 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
236 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
237 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1
238 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2
239 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
240 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
241 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
242 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
243 ; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
244 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
245 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27,u,u,u,u]
246 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
247 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,ymm0[22,23,26,27,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
248 ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
249 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
250 ; AVX512DQ-FCP-NEXT: vpextrd $2, %xmm1, 24(%rax)
251 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
252 ; AVX512DQ-FCP-NEXT: vmovq %xmm1, 16(%rax)
253 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rax)
254 ; AVX512DQ-FCP-NEXT: vzeroupper
255 ; AVX512DQ-FCP-NEXT: retq
; AVX512BW-LABEL: store_i16_stride7_vf2:
; AVX512BW: # %bb.0:
259 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
260 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
261 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
262 ; AVX512BW-NEXT: vmovdqa (%rsi), %xmm1
263 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2
264 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
265 ; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
266 ; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
267 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
268 ; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
269 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
270 ; AVX512BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
271 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm0
272 ; AVX512BW-NEXT: vpextrd $2, %xmm0, 24(%rax)
273 ; AVX512BW-NEXT: vmovq %xmm0, 16(%rax)
274 ; AVX512BW-NEXT: vmovdqa %xmm2, (%rax)
275 ; AVX512BW-NEXT: vzeroupper
276 ; AVX512BW-NEXT: retq
278 ; AVX512BW-FCP-LABEL: store_i16_stride7_vf2:
279 ; AVX512BW-FCP: # %bb.0:
280 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
281 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
282 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
283 ; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
284 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
285 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
286 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
287 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
288 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
289 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
290 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
291 ; AVX512BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
292 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
293 ; AVX512BW-FCP-NEXT: vpextrd $2, %xmm0, 24(%rax)
294 ; AVX512BW-FCP-NEXT: vmovq %xmm0, 16(%rax)
295 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rax)
296 ; AVX512BW-FCP-NEXT: vzeroupper
297 ; AVX512BW-FCP-NEXT: retq
299 ; AVX512DQ-BW-LABEL: store_i16_stride7_vf2:
300 ; AVX512DQ-BW: # %bb.0:
301 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
302 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
303 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
304 ; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm1
305 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm2
306 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
307 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
308 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
309 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
310 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
311 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
312 ; AVX512DQ-BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
313 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm0
314 ; AVX512DQ-BW-NEXT: vpextrd $2, %xmm0, 24(%rax)
315 ; AVX512DQ-BW-NEXT: vmovq %xmm0, 16(%rax)
316 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rax)
317 ; AVX512DQ-BW-NEXT: vzeroupper
318 ; AVX512DQ-BW-NEXT: retq
320 ; AVX512DQ-BW-FCP-LABEL: store_i16_stride7_vf2:
321 ; AVX512DQ-BW-FCP: # %bb.0:
322 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
323 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
324 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
325 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1
326 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm2
327 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
328 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm1, %ymm1
329 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
330 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
331 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1
332 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,0,0]
333 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm1, %ymm0, %ymm2
334 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm0
335 ; AVX512DQ-BW-FCP-NEXT: vpextrd $2, %xmm0, 24(%rax)
336 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, 16(%rax)
337 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rax)
338 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
339 ; AVX512DQ-BW-FCP-NEXT: retq
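; The IR below concatenates the seven <2 x i16> operands into one <14 x i16> value and
; then interleaves it so that element 0 of every input is stored first, followed by
; element 1 of every input, i.e. a stride-7 interleaved store of two elements per operand.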
340 %in.vec0 = load <2 x i16>, ptr %in.vecptr0, align 64
341 %in.vec1 = load <2 x i16>, ptr %in.vecptr1, align 64
342 %in.vec2 = load <2 x i16>, ptr %in.vecptr2, align 64
343 %in.vec3 = load <2 x i16>, ptr %in.vecptr3, align 64
344 %in.vec4 = load <2 x i16>, ptr %in.vecptr4, align 64
345 %in.vec5 = load <2 x i16>, ptr %in.vecptr5, align 64
346 %in.vec6 = load <2 x i16>, ptr %in.vecptr6, align 64
347 %1 = shufflevector <2 x i16> %in.vec0, <2 x i16> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
348 %2 = shufflevector <2 x i16> %in.vec2, <2 x i16> %in.vec3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
349 %3 = shufflevector <2 x i16> %in.vec4, <2 x i16> %in.vec5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
350 %4 = shufflevector <4 x i16> %1, <4 x i16> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
351 %5 = shufflevector <2 x i16> %in.vec6, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
352 %6 = shufflevector <4 x i16> %3, <4 x i16> %5, <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
353 %7 = shufflevector <6 x i16> %6, <6 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 undef, i32 undef>
354 %8 = shufflevector <8 x i16> %4, <8 x i16> %7, <14 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13>
355 %interleaved.vec = shufflevector <14 x i16> %8, <14 x i16> poison, <14 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13>
store <14 x i16> %interleaved.vec, ptr %out.vec, align 64
ret void
}

define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind {
; SSE-LABEL: store_i16_stride7_vf4:
; SSE: # %bb.0:
363 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
364 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
365 ; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
366 ; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero
367 ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
368 ; SSE-NEXT: movq {{.*#+}} xmm4 = mem[0],zero
369 ; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
370 ; SSE-NEXT: movq {{.*#+}} xmm6 = mem[0],zero
371 ; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
372 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm6[0]
373 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,1,0,1]
374 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,65535,65535]
375 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,3]
376 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
377 ; SSE-NEXT: pand %xmm6, %xmm8
378 ; SSE-NEXT: pandn %xmm7, %xmm6
379 ; SSE-NEXT: por %xmm8, %xmm6
380 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm4[0,1,1,3,4,5,6,7]
381 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm3[0]
382 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,65535,0,65535]
383 ; SSE-NEXT: movdqa %xmm5, %xmm10
384 ; SSE-NEXT: movdqa %xmm3, %xmm9
385 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[1,1,1,1,4,5,6,7]
386 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
387 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
388 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
389 ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,4,4,4]
390 ; SSE-NEXT: pand %xmm7, %xmm12
391 ; SSE-NEXT: pandn %xmm8, %xmm7
392 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,0,0,65535,65535]
393 ; SSE-NEXT: por %xmm12, %xmm7
394 ; SSE-NEXT: pand %xmm5, %xmm7
395 ; SSE-NEXT: pandn %xmm6, %xmm5
396 ; SSE-NEXT: por %xmm7, %xmm5
397 ; SSE-NEXT: psrld $16, %xmm10
398 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
399 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,0,0,65535]
400 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
401 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,6,6]
402 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
403 ; SSE-NEXT: pand %xmm7, %xmm6
404 ; SSE-NEXT: pandn %xmm9, %xmm7
405 ; SSE-NEXT: por %xmm6, %xmm7
406 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,0,0,0,65535,65535,65535]
407 ; SSE-NEXT: pand %xmm6, %xmm7
408 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[3,1,2,3]
409 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm8[0,1,2,0,4,5,6,7]
410 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,1,1]
411 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0]
412 ; SSE-NEXT: pandn %xmm9, %xmm6
413 ; SSE-NEXT: por %xmm7, %xmm6
414 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535]
415 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7]
416 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8
417 ; SSE-NEXT: psrlq $48, %xmm4
418 ; SSE-NEXT: por %xmm8, %xmm4
419 ; SSE-NEXT: pand %xmm7, %xmm4
420 ; SSE-NEXT: pandn %xmm1, %xmm7
421 ; SSE-NEXT: por %xmm4, %xmm7
422 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
423 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,1]
424 ; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,65535,0,0,0,65535]
425 ; SSE-NEXT: andps %xmm3, %xmm0
426 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
427 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
428 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
429 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
430 ; SSE-NEXT: andnps %xmm2, %xmm3
431 ; SSE-NEXT: orps %xmm0, %xmm3
432 ; SSE-NEXT: movaps %xmm3, (%rax)
433 ; SSE-NEXT: movq %xmm7, 48(%rax)
434 ; SSE-NEXT: movdqa %xmm6, 32(%rax)
; SSE-NEXT: movdqa %xmm5, 16(%rax)
; SSE-NEXT: retq
; AVX-LABEL: store_i16_stride7_vf4:
; AVX: # %bb.0:
440 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
441 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
442 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
443 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
444 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
445 ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
446 ; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
447 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
448 ; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
449 ; AVX-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
450 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
451 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,1,3]
452 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
453 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,1,2,1]
454 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,0,2,3,4,5,6,7]
455 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6],xmm4[7]
456 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
457 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,2,0,4,5,6,7]
458 ; AVX-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0]
459 ; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm6[1,1,1,1]
460 ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
461 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3,4],xmm3[5,6,7]
462 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7]
463 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3],xmm4[4,5,6,7]
464 ; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
465 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7]
466 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[0,1,8,9,u,u,u,u,u,u,u,u,2,3,2,3]
467 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
468 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,1,0,2,4,5,6,7]
469 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5,6,7]
470 ; AVX-NEXT: vpxor %xmm7, %xmm7, %xmm7
471 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7]
472 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
473 ; AVX-NEXT: vpackusdw %xmm8, %xmm7, %xmm7
474 ; AVX-NEXT: vpackusdw %xmm7, %xmm7, %xmm7
475 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6],xmm5[7]
476 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5,6,7]
477 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,2,3,6,7,u,u,u,u,u,u,4,5,12,13]
478 ; AVX-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
479 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
480 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm6[5],xmm0[6,7]
481 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5],xmm1[6,7]
482 ; AVX-NEXT: vmovdqa %xmm0, 16(%rax)
483 ; AVX-NEXT: vmovdqa %xmm5, (%rax)
484 ; AVX-NEXT: vmovq %xmm4, 48(%rax)
; AVX-NEXT: vmovdqa %xmm3, 32(%rax)
; AVX-NEXT: retq
; AVX2-LABEL: store_i16_stride7_vf4:
; AVX2: # %bb.0:
490 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
491 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
492 ; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
493 ; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
494 ; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
495 ; AVX2-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
496 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
497 ; AVX2-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
498 ; AVX2-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
499 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
500 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
501 ; AVX2-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
502 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
503 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
504 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3]
505 ; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,3],zero,zero,ymm2[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
506 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1]
507 ; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[0,1,8,9,4,5,6,7,4,5],zero,zero,ymm5[26,27],zero,zero,zero,zero,ymm5[24,25,20,21,22,23,20,21,28,29]
508 ; AVX2-NEXT: vpor %ymm5, %ymm4, %ymm4
509 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2]
510 ; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm0[2,3,2,3,2,3,2,3],zero,zero,zero,zero,ymm0[0,1,2,3,18,19,18,19,18,19,18,19,26,27],zero,zero,ymm0[16,17,18,19]
511 ; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19],zero,zero,zero,zero
512 ; AVX2-NEXT: vpor %ymm5, %ymm6, %ymm5
513 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
514 ; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
515 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,6,7],zero,zero,zero,zero,ymm0[4,5,4,5,4,5,4,5,28,29,22,23,30,31],zero,zero,ymm0[20,21,20,21,20,21,20,21]
516 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,5,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,23],zero,zero,zero,zero,zero,zero,zero,zero
517 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
518 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[14,15,14,15,14,15,14,15,4,5,6,7,14,15,14,15,30,31,30,31,30,31,30,31,20,21,22,23,30,31,30,31]
519 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,1]
520 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
521 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,0,0,0,0,65535,0,0,0,0,0,0,0,0]
522 ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
523 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,0,0,0,0]
524 ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
525 ; AVX2-NEXT: vmovdqa %ymm4, (%rax)
526 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
527 ; AVX2-NEXT: vmovq %xmm1, 48(%rax)
528 ; AVX2-NEXT: vmovdqa %xmm0, 32(%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
; AVX2-FP-LABEL: store_i16_stride7_vf4:
; AVX2-FP: # %bb.0:
534 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
535 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
536 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
537 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
538 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
539 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
540 ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
541 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
542 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
543 ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
544 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
545 ; AVX2-FP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
546 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
547 ; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
548 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3]
549 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,3],zero,zero,ymm2[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
550 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1]
551 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[0,1,8,9,4,5,6,7,4,5],zero,zero,ymm5[26,27],zero,zero,zero,zero,ymm5[24,25,20,21,22,23,20,21,28,29]
552 ; AVX2-FP-NEXT: vpor %ymm5, %ymm4, %ymm4
553 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2]
554 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm0[2,3,2,3,2,3,2,3],zero,zero,zero,zero,ymm0[0,1,2,3,18,19,18,19,18,19,18,19,26,27],zero,zero,ymm0[16,17,18,19]
555 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,ymm1[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19],zero,zero,zero,zero
556 ; AVX2-FP-NEXT: vpor %ymm5, %ymm6, %ymm5
557 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
558 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
559 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[12,13,6,7],zero,zero,zero,zero,ymm0[4,5,4,5,4,5,4,5,28,29,22,23,30,31],zero,zero,ymm0[20,21,20,21,20,21,20,21]
560 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[4,5,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[22,23],zero,zero,zero,zero,zero,zero,zero,zero
561 ; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0
562 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[14,15,14,15,14,15,14,15,4,5,6,7,14,15,14,15,30,31,30,31,30,31,30,31,20,21,22,23,30,31,30,31]
563 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[4,5,12,13,4,5,6,7,8,9,10,11,4,5,6,7]
564 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,0,0,0,0,65535,0,0,0,0,0,0,0,0]
565 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
566 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,0,0,0,0]
567 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
568 ; AVX2-FP-NEXT: vmovdqa %ymm4, (%rax)
569 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
570 ; AVX2-FP-NEXT: vmovq %xmm1, 48(%rax)
571 ; AVX2-FP-NEXT: vmovdqa %xmm0, 32(%rax)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
; AVX2-FCP-LABEL: store_i16_stride7_vf4:
; AVX2-FCP: # %bb.0:
577 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
578 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
579 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
580 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
581 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
582 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
583 ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
584 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
585 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
586 ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
587 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
588 ; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
589 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
590 ; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
591 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
592 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,7,1,3,7,0,0,0]
593 ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1
594 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
595 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7]
596 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
597 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm3
598 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
599 ; AVX2-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1
600 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,3],zero,zero,ymm2[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
601 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
602 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,4,5,6,7,4,5],zero,zero,ymm2[26,27],zero,zero,zero,zero,ymm2[24,25,20,21,22,23,20,21,28,29]
603 ; AVX2-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
604 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6]
605 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
606 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0
607 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,4,5,8,9,u,u,u,u,u,u,u,u,18,19,22,23,26,27,u,u,u,u]
608 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
609 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
610 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax)
611 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0
612 ; AVX2-FCP-NEXT: vmovq %xmm0, 48(%rax)
613 ; AVX2-FCP-NEXT: vmovdqa %xmm1, 32(%rax)
614 ; AVX2-FCP-NEXT: vzeroupper
615 ; AVX2-FCP-NEXT: retq
; AVX512-LABEL: store_i16_stride7_vf4:
; AVX512: # %bb.0:
619 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
620 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
621 ; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
622 ; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
623 ; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
624 ; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
625 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
626 ; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
627 ; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
628 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
629 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
630 ; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
631 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
632 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
633 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3]
634 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2]
635 ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[0,1,u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm0[u,u,u,u]
636 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u],zero,zero,zero,zero,ymm0[4,5,u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm0[u,u,u,u,u,u,u,u]
637 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0
638 ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm1[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[18,19,u,u,u,u]
639 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,4,5,12,13],zero,zero,ymm1[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[22,23,u,u,u,u,u,u,u,u]
640 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1
641 ; AVX512-NEXT: vporq %zmm0, %zmm1, %zmm0
642 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm2[2,3,0,1]
643 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm1[26,27],zero,zero,zero,zero,ymm1[u,u,u,u,u,u,20,21,28,29]
644 ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm2[u,u,u,u,u,u,6,7,14,15],zero,zero,ymm2[30,31,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
645 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
646 ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,2,3],zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
647 ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,12,13,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[6,7],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
648 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
649 ; AVX512-NEXT: vporq %zmm1, %zmm2, %zmm1
650 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = zmm0 ^ (mem & (zmm1 ^ zmm0))
651 ; AVX512-NEXT: vextracti32x4 $2, %zmm1, 32(%rax)
652 ; AVX512-NEXT: vextracti32x4 $3, %zmm1, %xmm0
653 ; AVX512-NEXT: vmovq %xmm0, 48(%rax)
654 ; AVX512-NEXT: vmovdqa %ymm1, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
658 ; AVX512-FCP-LABEL: store_i16_stride7_vf4:
659 ; AVX512-FCP: # %bb.0:
660 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
661 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
662 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
663 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
664 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
665 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
666 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
667 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
668 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
669 ; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
670 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
671 ; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
672 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
673 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
674 ; AVX512-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1
675 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6]
676 ; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1]
677 ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0
678 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
679 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7]
680 ; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1]
681 ; AVX512-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1
682 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
683 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
684 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,2,3],zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
685 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
686 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm3[26,27],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,20,21,28,29]
687 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = mem & (ymm3 | ymm1)
688 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,7,1,3,7,0,0,0]
689 ; AVX512-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1
690 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u]
691 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
692 ; AVX512-FCP-NEXT: vporq %zmm0, %zmm1, %zmm0
693 ; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax)
694 ; AVX512-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1
695 ; AVX512-FCP-NEXT: vmovq %xmm1, 48(%rax)
696 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax)
697 ; AVX512-FCP-NEXT: vzeroupper
698 ; AVX512-FCP-NEXT: retq
; AVX512DQ-LABEL: store_i16_stride7_vf4:
; AVX512DQ: # %bb.0:
702 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
703 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
704 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
705 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
706 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
707 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
708 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
709 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
710 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
711 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
712 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
713 ; AVX512DQ-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
714 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
715 ; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
716 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,1,3]
717 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2]
718 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm0[0,1,u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm0[u,u,u,u]
719 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u],zero,zero,zero,zero,ymm0[4,5,u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm0[u,u,u,u,u,u,u,u]
720 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0
721 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm1[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[18,19,u,u,u,u]
722 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,4,5,12,13],zero,zero,ymm1[u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[22,23,u,u,u,u,u,u,u,u]
723 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1
724 ; AVX512DQ-NEXT: vporq %zmm0, %zmm1, %zmm0
725 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm2[2,3,0,1]
726 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm1[26,27],zero,zero,zero,zero,ymm1[u,u,u,u,u,u,20,21,28,29]
727 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm2[u,u,u,u,u,u,6,7,14,15],zero,zero,ymm2[30,31,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
728 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
729 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,2,3],zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
730 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[4,5,12,13,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[6,7],zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
731 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
732 ; AVX512DQ-NEXT: vporq %zmm1, %zmm2, %zmm1
733 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm1 = zmm0 ^ (mem & (zmm1 ^ zmm0))
734 ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm1, 32(%rax)
735 ; AVX512DQ-NEXT: vextracti32x4 $3, %zmm1, %xmm0
736 ; AVX512DQ-NEXT: vmovq %xmm0, 48(%rax)
737 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rax)
738 ; AVX512DQ-NEXT: vzeroupper
739 ; AVX512DQ-NEXT: retq
741 ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf4:
742 ; AVX512DQ-FCP: # %bb.0:
743 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
744 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
745 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
746 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
747 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
748 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
749 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
750 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
751 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
752 ; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
753 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
754 ; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
755 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
756 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
757 ; AVX512DQ-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1
758 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6]
759 ; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1]
760 ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm0
761 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[18,19,22,23,26,27],zero,zero,zero,zero
762 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7]
763 ; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1]
764 ; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1
765 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero
766 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
767 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[0,1,8,9],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,2,3],zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
768 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
769 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm3[26,27],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,20,21,28,29]
770 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = mem & (ymm3 | ymm1)
771 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,7,1,3,7,0,0,0]
772 ; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1
773 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u]
774 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
775 ; AVX512DQ-FCP-NEXT: vporq %zmm0, %zmm1, %zmm0
776 ; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax)
777 ; AVX512DQ-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1
778 ; AVX512DQ-FCP-NEXT: vmovq %xmm1, 48(%rax)
779 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax)
780 ; AVX512DQ-FCP-NEXT: vzeroupper
781 ; AVX512DQ-FCP-NEXT: retq
; AVX512BW-LABEL: store_i16_stride7_vf4:
; AVX512BW: # %bb.0:
785 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
786 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
787 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
788 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
789 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
790 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
791 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
792 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
793 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
794 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
795 ; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
796 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
797 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
798 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
799 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,32,40,36,1,5,9,13,33,41,37,2,6,10,14,34,42,38,3,7,11,15,35,43,39,0,0,0,0]
800 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm1
801 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax)
802 ; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm0
803 ; AVX512BW-NEXT: vmovq %xmm0, 48(%rax)
804 ; AVX512BW-NEXT: vmovdqa %ymm1, (%rax)
805 ; AVX512BW-NEXT: vzeroupper
806 ; AVX512BW-NEXT: retq
808 ; AVX512BW-FCP-LABEL: store_i16_stride7_vf4:
809 ; AVX512BW-FCP: # %bb.0:
810 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
811 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
812 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
813 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
814 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
815 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
816 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
817 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
818 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
819 ; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
820 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
821 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
822 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
823 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
824 ; AVX512BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1
825 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,4,8,12,32,36,40,1,5,9,13,33,37,41,2,6,10,14,34,38,42,3,7,11,15,35,39,43,0,0,0,0]
826 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm0
827 ; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax)
828 ; AVX512BW-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1
829 ; AVX512BW-FCP-NEXT: vmovq %xmm1, 48(%rax)
830 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
831 ; AVX512BW-FCP-NEXT: vzeroupper
832 ; AVX512BW-FCP-NEXT: retq
834 ; AVX512DQ-BW-LABEL: store_i16_stride7_vf4:
835 ; AVX512DQ-BW: # %bb.0:
836 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
837 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
838 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
839 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
840 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
841 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
842 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
843 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
844 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
845 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
846 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
847 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
848 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
849 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
850 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,32,40,36,1,5,9,13,33,41,37,2,6,10,14,34,42,38,3,7,11,15,35,43,39,0,0,0,0]
851 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm1
852 ; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax)
853 ; AVX512DQ-BW-NEXT: vextracti32x4 $3, %zmm1, %xmm0
854 ; AVX512DQ-BW-NEXT: vmovq %xmm0, 48(%rax)
855 ; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%rax)
856 ; AVX512DQ-BW-NEXT: vzeroupper
857 ; AVX512DQ-BW-NEXT: retq
859 ; AVX512DQ-BW-FCP-LABEL: store_i16_stride7_vf4:
860 ; AVX512DQ-BW-FCP: # %bb.0:
861 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
862 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
863 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
864 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
865 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
866 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
867 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
868 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
869 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
870 ; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
871 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
872 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
873 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
874 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,0]
875 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm3, %ymm0, %ymm1
876 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,4,8,12,32,36,40,1,5,9,13,33,37,41,2,6,10,14,34,38,42,3,7,11,15,35,39,43,0,0,0,0]
877 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm0
878 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax)
879 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $3, %zmm0, %xmm1
880 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, 48(%rax)
881 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
882 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
883 ; AVX512DQ-BW-FCP-NEXT: retq
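; Same structure as the vf2 case above, but with <4 x i16> operands: the loads are
; concatenated into a <28 x i16> value and the final shufflevector interleaves them
; with stride 7 (element i of each of the seven inputs, for i = 0..3) before the store.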
884 %in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64
885 %in.vec1 = load <4 x i16>, ptr %in.vecptr1, align 64
886 %in.vec2 = load <4 x i16>, ptr %in.vecptr2, align 64
887 %in.vec3 = load <4 x i16>, ptr %in.vecptr3, align 64
888 %in.vec4 = load <4 x i16>, ptr %in.vecptr4, align 64
889 %in.vec5 = load <4 x i16>, ptr %in.vecptr5, align 64
890 %in.vec6 = load <4 x i16>, ptr %in.vecptr6, align 64
891 %1 = shufflevector <4 x i16> %in.vec0, <4 x i16> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
892 %2 = shufflevector <4 x i16> %in.vec2, <4 x i16> %in.vec3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
893 %3 = shufflevector <4 x i16> %in.vec4, <4 x i16> %in.vec5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
894 %4 = shufflevector <8 x i16> %1, <8 x i16> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
895 %5 = shufflevector <4 x i16> %in.vec6, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
896 %6 = shufflevector <8 x i16> %3, <8 x i16> %5, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
897 %7 = shufflevector <12 x i16> %6, <12 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef>
898 %8 = shufflevector <16 x i16> %4, <16 x i16> %7, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
899 %interleaved.vec = shufflevector <28 x i16> %8, <28 x i16> poison, <28 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27>
store <28 x i16> %interleaved.vec, ptr %out.vec, align 64
ret void
}

define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind {
; SSE-LABEL: store_i16_stride7_vf8:
; SSE: # %bb.0:
907 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
908 ; SSE-NEXT: movdqa (%rdi), %xmm3
909 ; SSE-NEXT: movdqa (%rsi), %xmm8
910 ; SSE-NEXT: movdqa (%rdx), %xmm5
911 ; SSE-NEXT: movdqa (%rcx), %xmm11
912 ; SSE-NEXT: movdqa (%r8), %xmm4
913 ; SSE-NEXT: movdqa (%r9), %xmm10
914 ; SSE-NEXT: movdqa (%rax), %xmm2
915 ; SSE-NEXT: movdqa %xmm5, %xmm0
916 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
917 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
918 ; SSE-NEXT: movdqa %xmm3, %xmm6
919 ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
920 ; SSE-NEXT: movdqa %xmm6, %xmm1
921 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,3]
922 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,3,2,3]
923 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,65535,65535,65535,65535]
924 ; SSE-NEXT: movdqa %xmm4, %xmm7
925 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
926 ; SSE-NEXT: movdqa %xmm10, %xmm13
927 ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
928 ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7]
929 ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4]
930 ; SSE-NEXT: pand %xmm12, %xmm13
931 ; SSE-NEXT: pandn %xmm9, %xmm12
932 ; SSE-NEXT: por %xmm13, %xmm12
933 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm12[0,3]
934 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,65535,65535,65535,65535]
935 ; SSE-NEXT: pandn %xmm2, %xmm9
936 ; SSE-NEXT: movdqa %xmm7, %xmm12
937 ; SSE-NEXT: movdqa %xmm7, %xmm13
938 ; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
939 ; SSE-NEXT: por %xmm9, %xmm13
940 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,2,2,2]
941 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,65535,0,65535]
942 ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm11[0,1,2,3,4,4,4,4]
943 ; SSE-NEXT: pand %xmm14, %xmm15
944 ; SSE-NEXT: pandn %xmm9, %xmm14
945 ; SSE-NEXT: movaps {{.*#+}} xmm9 = [65535,0,0,0,65535,65535,65535,65535]
946 ; SSE-NEXT: por %xmm15, %xmm14
947 ; SSE-NEXT: movdqa %xmm6, %xmm15
948 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm14[3,3]
949 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm11[3,3,3,3,4,5,6,7]
950 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm15[0,2]
951 ; SSE-NEXT: andps %xmm9, %xmm14
952 ; SSE-NEXT: andnps %xmm13, %xmm9
953 ; SSE-NEXT: orps %xmm14, %xmm9
954 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,1,0,1]
955 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,65535,65535]
956 ; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
957 ; SSE-NEXT: pand %xmm14, %xmm12
958 ; SSE-NEXT: pandn %xmm13, %xmm14
959 ; SSE-NEXT: por %xmm12, %xmm14
960 ; SSE-NEXT: movdqa %xmm11, %xmm12
961 ; SSE-NEXT: psrld $16, %xmm12
962 ; SSE-NEXT: movdqa %xmm5, %xmm13
963 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
964 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,65535,65535,65535,65535,65535]
965 ; SSE-NEXT: movdqa %xmm8, %xmm15
966 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1],xmm15[2],xmm3[2],xmm15[3],xmm3[3]
967 ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7]
968 ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4]
969 ; SSE-NEXT: pand %xmm12, %xmm15
970 ; SSE-NEXT: pandn %xmm13, %xmm12
971 ; SSE-NEXT: movdqa %xmm5, %xmm13
972 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3]
973 ; SSE-NEXT: por %xmm15, %xmm12
974 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,0,0,65535,65535]
975 ; SSE-NEXT: pand %xmm15, %xmm12
976 ; SSE-NEXT: pandn %xmm14, %xmm15
977 ; SSE-NEXT: movdqa %xmm4, %xmm14
978 ; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm10[4],xmm14[5],xmm10[5],xmm14[6],xmm10[6],xmm14[7],xmm10[7]
979 ; SSE-NEXT: por %xmm12, %xmm15
980 ; SSE-NEXT: psrlq $48, %xmm11
981 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm13 = xmm13[1],xmm11[1]
982 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,0,65535,65535,65535]
983 ; SSE-NEXT: pandn %xmm13, %xmm11
984 ; SSE-NEXT: movdqa %xmm6, %xmm13
985 ; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
986 ; SSE-NEXT: por %xmm11, %xmm13
987 ; SSE-NEXT: psrld $16, %xmm10
988 ; SSE-NEXT: movdqa %xmm4, %xmm11
989 ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
990 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,65535]
991 ; SSE-NEXT: movdqa %xmm0, %xmm12
992 ; SSE-NEXT: pandn %xmm11, %xmm12
993 ; SSE-NEXT: por %xmm13, %xmm12
994 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [0,65535,65535,65535,65535,65535,65535,0]
995 ; SSE-NEXT: pand %xmm10, %xmm12
996 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[3,3,3,3]
997 ; SSE-NEXT: pandn %xmm11, %xmm10
998 ; SSE-NEXT: por %xmm12, %xmm10
999 ; SSE-NEXT: movdqa %xmm3, %xmm12
1000 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm3[1,1,1,1,4,5,6,7]
1001 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
1002 ; SSE-NEXT: psrld $16, %xmm8
1003 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1]
1004 ; SSE-NEXT: movdqa %xmm0, %xmm8
1005 ; SSE-NEXT: pandn %xmm12, %xmm8
1006 ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,5,6,6]
1007 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
1008 ; SSE-NEXT: pand %xmm0, %xmm12
1009 ; SSE-NEXT: por %xmm8, %xmm12
1010 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,0,0,0,65535,65535,65535]
1011 ; SSE-NEXT: pand %xmm8, %xmm12
1012 ; SSE-NEXT: movdqa %xmm7, %xmm13
1013 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,2],xmm2[1,1]
1014 ; SSE-NEXT: pandn %xmm13, %xmm8
1015 ; SSE-NEXT: por %xmm12, %xmm8
1016 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
1017 ; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9]
1018 ; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
1019 ; SSE-NEXT: pand %xmm0, %xmm6
1020 ; SSE-NEXT: pandn %xmm12, %xmm0
1021 ; SSE-NEXT: por %xmm6, %xmm0
1022 ; SSE-NEXT: movaps %xmm2, %xmm6
1023 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,3],xmm14[0,1]
1024 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
1025 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm4[2,1]
1026 ; SSE-NEXT: movaps {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,0,65535]
1027 ; SSE-NEXT: andps %xmm4, %xmm6
1028 ; SSE-NEXT: andnps %xmm0, %xmm4
1029 ; SSE-NEXT: orps %xmm6, %xmm4
1030 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm3[0]
1031 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm11[2,1]
1032 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,1,1]
1033 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
1034 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1035 ; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,65535,65535,65535,0,0,0,65535]
1036 ; SSE-NEXT: andps %xmm2, %xmm5
1037 ; SSE-NEXT: andnps %xmm0, %xmm2
1038 ; SSE-NEXT: orps %xmm5, %xmm2
1039 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1040 ; SSE-NEXT: movaps %xmm2, (%rax)
1041 ; SSE-NEXT: movaps %xmm4, 64(%rax)
1042 ; SSE-NEXT: movdqa %xmm15, 16(%rax)
1043 ; SSE-NEXT: movdqa %xmm8, 32(%rax)
1044 ; SSE-NEXT: movaps %xmm9, 48(%rax)
1045 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3]
1046 ; SSE-NEXT: movaps %xmm1, 80(%rax)
1047 ; SSE-NEXT: movdqa %xmm10, 96(%rax)
1048 ; SSE-NEXT: retq
1050 ; AVX-LABEL: store_i16_stride7_vf8:
1051 ; AVX: # %bb.0:
1052 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
1053 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
1054 ; AVX-NEXT: vmovdqa (%rdi), %xmm7
1055 ; AVX-NEXT: vmovdqa (%rsi), %xmm8
1056 ; AVX-NEXT: vmovdqa (%rdx), %xmm2
1057 ; AVX-NEXT: vmovdqa (%rcx), %xmm4
1058 ; AVX-NEXT: vmovdqa (%r8), %xmm0
1059 ; AVX-NEXT: vmovdqa (%r9), %xmm3
1060 ; AVX-NEXT: vmovdqa (%r10), %xmm1
1061 ; AVX-NEXT: vpsrld $16, %xmm4, %xmm5
1062 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
1063 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
1064 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,2,2,2,4,5,6,7]
1065 ; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,5,4]
1066 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3,4,5,6,7]
1067 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
1068 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[0,0,1,1]
1069 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
1070 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,2,2,4,5,6,7]
1071 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1]
1072 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3],xmm10[4,5,6,7]
1073 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
1074 ; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
1075 ; AVX-NEXT: vandps %ymm6, %ymm5, %ymm5
1076 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1077 ; AVX-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
1078 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,1,0,1]
1079 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4],xmm12[5],xmm11[6,7]
1080 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,1,0,1]
1081 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,0,0,0]
1082 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm13[6,7]
1083 ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11
1084 ; AVX-NEXT: vandnps %ymm11, %ymm6, %ymm6
1085 ; AVX-NEXT: vorps %ymm6, %ymm5, %ymm5
1086 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,2,2,2]
1087 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm4[3,3,3,3,4,5,6,7]
1088 ; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4]
1089 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm6[6],xmm11[7]
1090 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
1091 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[0,1,0,1]
1092 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4,5],xmm11[6,7]
1093 ; AVX-NEXT: vpsrld $16, %xmm8, %xmm8
1094 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
1095 ; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,4,5,6,6]
1096 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
1097 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4],xmm7[5,6],xmm8[7]
1098 ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm7
1099 ; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
1100 ; AVX-NEXT: vandps %ymm7, %ymm8, %ymm7
1101 ; AVX-NEXT: vpsrldq {{.*#+}} xmm9 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1102 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm1[3],xmm9[4,5,6,7]
1103 ; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,2],xmm1[1,3]
1104 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9
1105 ; AVX-NEXT: vandnps %ymm9, %ymm8, %ymm8
1106 ; AVX-NEXT: vorps %ymm7, %ymm8, %ymm7
1107 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
1108 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[2,2,3,3]
1109 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm8[4,5,6,7]
1110 ; AVX-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9]
1111 ; AVX-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
1112 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6],xmm10[7]
1113 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8
1114 ; AVX-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
1115 ; AVX-NEXT: vandnps %ymm8, %ymm9, %ymm8
1116 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
1117 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7]
1118 ; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4]
1119 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,3,2,3]
1120 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5,6,7]
1121 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
1122 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7]
1123 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1]
1124 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[2,2,3,3]
1125 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3],xmm11[4,5,6,7]
1126 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10
1127 ; AVX-NEXT: vandps %ymm9, %ymm10, %ymm9
1128 ; AVX-NEXT: vorps %ymm8, %ymm9, %ymm8
1129 ; AVX-NEXT: vpsrlq $48, %xmm4, %xmm4
1130 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm4[1]
1131 ; AVX-NEXT: vpsrldq {{.*#+}} xmm4 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1132 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4],xmm4[5,6,7]
1133 ; AVX-NEXT: vpsrld $16, %xmm3, %xmm3
1134 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1135 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1136 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,u,u,u,u,u,u,u,u,6,7,10,11,12,13]
1137 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4],xmm0[5,6,7]
1138 ; AVX-NEXT: vmovdqa %xmm0, 96(%rax)
1139 ; AVX-NEXT: vmovaps %ymm8, 64(%rax)
1140 ; AVX-NEXT: vmovaps %ymm7, 32(%rax)
1141 ; AVX-NEXT: vmovaps %ymm5, (%rax)
1142 ; AVX-NEXT: vzeroupper
1143 ; AVX-NEXT: retq
1145 ; AVX2-LABEL: store_i16_stride7_vf8:
1146 ; AVX2: # %bb.0:
1147 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
1148 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
1149 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
1150 ; AVX2-NEXT: vmovdqa (%rsi), %xmm2
1151 ; AVX2-NEXT: vmovdqa (%rdx), %xmm4
1152 ; AVX2-NEXT: vmovdqa (%rcx), %xmm5
1153 ; AVX2-NEXT: vmovdqa (%r8), %xmm1
1154 ; AVX2-NEXT: vmovdqa (%r9), %xmm3
1155 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7
1156 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8
1157 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm9
1158 ; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm8[4,5,u,u,u,u,u,u,u,u,u,u,u,u,6,7,22,23,u,u,u,u,u,u,u,u,u,u,u,u,24,25]
1159 ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1]
1160 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[2,2,2,2,4,5,6,7,10,10,10,10,12,13,14,15]
1161 ; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,2,2,4,5,6,6]
1162 ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3,4,5],ymm10[6],ymm6[7,8],ymm10[9],ymm6[10,11,12,13],ymm10[14],ymm6[15]
1163 ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm7[0,2,1,3]
1164 ; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u]
1165 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
1166 ; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6
1167 ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2]
1168 ; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u]
1169 ; AVX2-NEXT: vpbroadcastd 4(%r10), %ymm12
1170 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
1171 ; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
1172 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
1173 ; AVX2-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6
1174 ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm8[1,3,1,3]
1175 ; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[20,21,28,29],zero,zero,zero,zero
1176 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm7[1,3,1,3]
1177 ; AVX2-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm12[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[20,21,28,29],zero,zero,zero,zero,zero,zero,zero,zero
1178 ; AVX2-NEXT: vpor %ymm11, %ymm12, %ymm11
1179 ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1]
1180 ; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21]
1181 ; AVX2-NEXT: vpbroadcastd 8(%r10), %ymm12
1182 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
1183 ; AVX2-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm9
1184 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
1185 ; AVX2-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9
1186 ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2]
1187 ; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1188 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,0]
1189 ; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[28,29,20,21]
1190 ; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7
1191 ; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u]
1192 ; AVX2-NEXT: vpbroadcastd (%r10), %ymm10
1193 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0]
1194 ; AVX2-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8
1195 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
1196 ; AVX2-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7
1197 ; AVX2-NEXT: vpsrlq $48, %xmm5, %xmm5
1198 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1]
1199 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1200 ; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1201 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3,4],xmm0[5,6,7]
1202 ; AVX2-NEXT: vpsrld $16, %xmm3, %xmm2
1203 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1204 ; AVX2-NEXT: vpbroadcastd 12(%r10), %xmm2
1205 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7]
1206 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7]
1207 ; AVX2-NEXT: vmovdqa %xmm0, 96(%rax)
1208 ; AVX2-NEXT: vmovdqa %ymm7, (%rax)
1209 ; AVX2-NEXT: vmovdqa %ymm9, 64(%rax)
1210 ; AVX2-NEXT: vmovdqa %ymm6, 32(%rax)
1211 ; AVX2-NEXT: vzeroupper
1212 ; AVX2-NEXT: retq
1214 ; AVX2-FP-LABEL: store_i16_stride7_vf8:
1215 ; AVX2-FP: # %bb.0:
1216 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1217 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1218 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
1219 ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm2
1220 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm4
1221 ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm5
1222 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm1
1223 ; AVX2-FP-NEXT: vmovdqa (%r9), %xmm3
1224 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7
1225 ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8
1226 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm9
1227 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,4,5,u,u,u,u,u,u,u,u,8,9,6,7,6,7,20,21,u,u,u,u,u,u,u,u,24,25,8,9]
1228 ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm8, %ymm10
1229 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,3,0,1]
1230 ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm11, %ymm6
1231 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm6[1],ymm10[2,3,4,5],ymm6[6],ymm10[7,8],ymm6[9],ymm10[10,11,12,13],ymm6[14],ymm10[15]
1232 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm7[0,2,1,3]
1233 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u]
1234 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
1235 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6
1236 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2]
1237 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u]
1238 ; AVX2-FP-NEXT: vpbroadcastd 4(%r10), %ymm12
1239 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
1240 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
1241 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
1242 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6
1243 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm8[1,3,1,3]
1244 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[20,21,28,29],zero,zero,zero,zero
1245 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[1,3,1,3]
1246 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm12[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[20,21,28,29],zero,zero,zero,zero,zero,zero,zero,zero
1247 ; AVX2-FP-NEXT: vpor %ymm11, %ymm12, %ymm11
1248 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1]
1249 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21]
1250 ; AVX2-FP-NEXT: vpbroadcastd 8(%r10), %ymm12
1251 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
1252 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm9
1253 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
1254 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9
1255 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2]
1256 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1257 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,0]
1258 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[28,29,20,21]
1259 ; AVX2-FP-NEXT: vpor %ymm7, %ymm8, %ymm7
1260 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u]
1261 ; AVX2-FP-NEXT: vpbroadcastd (%r10), %ymm10
1262 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0]
1263 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8
1264 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
1265 ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7
1266 ; AVX2-FP-NEXT: vpsrlq $48, %xmm5, %xmm5
1267 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1]
1268 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1269 ; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1270 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3,4],xmm0[5,6,7]
1271 ; AVX2-FP-NEXT: vpsrld $16, %xmm3, %xmm2
1272 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1273 ; AVX2-FP-NEXT: vpbroadcastd 12(%r10), %xmm2
1274 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7]
1275 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7]
1276 ; AVX2-FP-NEXT: vmovdqa %xmm0, 96(%rax)
1277 ; AVX2-FP-NEXT: vmovdqa %ymm7, (%rax)
1278 ; AVX2-FP-NEXT: vmovdqa %ymm9, 64(%rax)
1279 ; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%rax)
1280 ; AVX2-FP-NEXT: vzeroupper
1281 ; AVX2-FP-NEXT: retq
1283 ; AVX2-FCP-LABEL: store_i16_stride7_vf8:
1284 ; AVX2-FCP: # %bb.0:
1285 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1286 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1287 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
1288 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm2
1289 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm4
1290 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm5
1291 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm1
1292 ; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm3
1293 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7
1294 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8
1295 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm9
1296 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,2,1,3]
1297 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,24,25],zero,zero,zero,zero
1298 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,5,0,0,5,2,6,0]
1299 ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm10
1300 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[20,21,24,25]
1301 ; AVX2-FCP-NEXT: vpor %ymm6, %ymm10, %ymm6
1302 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2]
1303 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u]
1304 ; AVX2-FCP-NEXT: vpbroadcastd 4(%r10), %ymm12
1305 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
1306 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
1307 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
1308 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6
1309 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm8[1,3,1,3]
1310 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[20,21,28,29],zero,zero,zero,zero
1311 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm7[1,3,1,3]
1312 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm12[2,3,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[20,21,28,29],zero,zero,zero,zero,zero,zero,zero,zero
1313 ; AVX2-FCP-NEXT: vpor %ymm11, %ymm12, %ymm11
1314 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1]
1315 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21]
1316 ; AVX2-FCP-NEXT: vpbroadcastd 8(%r10), %ymm12
1317 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
1318 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm9
1319 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
1320 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9
1321 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2]
1322 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[18,19,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1323 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,0]
1324 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[28,29,20,21]
1325 ; AVX2-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7
1326 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u]
1327 ; AVX2-FCP-NEXT: vpbroadcastd (%r10), %ymm10
1328 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0]
1329 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8
1330 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
1331 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7
1332 ; AVX2-FCP-NEXT: vpsrlq $48, %xmm5, %xmm5
1333 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1]
1334 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1335 ; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1336 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3,4],xmm0[5,6,7]
1337 ; AVX2-FCP-NEXT: vpsrld $16, %xmm3, %xmm2
1338 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1339 ; AVX2-FCP-NEXT: vpbroadcastd 12(%r10), %xmm2
1340 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7]
1341 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7]
1342 ; AVX2-FCP-NEXT: vmovdqa %xmm0, 96(%rax)
1343 ; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rax)
1344 ; AVX2-FCP-NEXT: vmovdqa %ymm9, 64(%rax)
1345 ; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%rax)
1346 ; AVX2-FCP-NEXT: vzeroupper
1347 ; AVX2-FCP-NEXT: retq
1349 ; AVX512-LABEL: store_i16_stride7_vf8:
1350 ; AVX512: # %bb.0:
1351 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
1352 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
1353 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
1354 ; AVX512-NEXT: vmovdqa (%rsi), %xmm1
1355 ; AVX512-NEXT: vmovdqa (%rdx), %xmm3
1356 ; AVX512-NEXT: vmovdqa (%rcx), %xmm4
1357 ; AVX512-NEXT: vmovdqa (%r8), %xmm5
1358 ; AVX512-NEXT: vmovdqa (%r9), %xmm6
1359 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7
1360 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm2
1361 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8
1362 ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2]
1363 ; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
1364 ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm7[0,2,1,3]
1365 ; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,ymm10[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero
1366 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9
1367 ; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[4,5,u,u,u,u,u,u,u,u,u,u,u,u,6,7,22,23,u,u,u,u,u,u,u,u,u,u,u,u,24,25]
1368 ; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm2[2,3,0,1]
1369 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[2,2,2,2,4,5,6,7,10,10,10,10,12,13,14,15]
1370 ; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,1,2,2,4,5,6,6]
1371 ; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4,5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11,12,13],ymm11[14],ymm10[15]
1372 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10
1373 ; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,2,0]
1374 ; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,8,9],zero,zero,zero,zero,ymm11[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm11[u,u,u,u,u,u,28,29,20,21]
1375 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
1376 ; AVX512-NEXT: vporq %zmm9, %zmm10, %zmm9
1377 ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2]
1378 ; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm10[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm10[u,u,u,u]
1379 ; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13],zero,zero,ymm10[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm10[u,u,u,u,u,u,u,u]
1380 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
1381 ; AVX512-NEXT: vpbroadcastd (%r10), %ymm11
1382 ; AVX512-NEXT: vpbroadcastd 4(%r10), %ymm12
1383 ; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11
1384 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10
1385 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm9))
1386 ; AVX512-NEXT: vpsrlq $48, %xmm4, %xmm4
1387 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
1388 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1389 ; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1390 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7]
1391 ; AVX512-NEXT: vpsrld $16, %xmm6, %xmm1
1392 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
1393 ; AVX512-NEXT: vpbroadcastd 12(%r10), %xmm3
1394 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6],xmm3[7]
1395 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7]
1396 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3]
1397 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u],zero,zero,zero,zero,ymm1[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[20,21,28,29,u,u,u,u]
1398 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,3,1,3]
1399 ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm2[u,u,u,u]
1400 ; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1
1401 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1]
1402 ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21]
1403 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
1404 ; AVX512-NEXT: vpternlogd {{.*#+}} ymm3 = (mem & ~ymm3) | ymm2
1405 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (mem & (ymm3 ^ ymm1))
1406 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1
1407 ; AVX512-NEXT: vmovdqa %xmm0, 96(%rax)
1408 ; AVX512-NEXT: vmovdqa %ymm1, 64(%rax)
1409 ; AVX512-NEXT: vmovdqa64 %zmm11, (%rax)
1410 ; AVX512-NEXT: vzeroupper
1411 ; AVX512-NEXT: retq
1413 ; AVX512-FCP-LABEL: store_i16_stride7_vf8:
1414 ; AVX512-FCP: # %bb.0:
1415 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1416 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1417 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
1418 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1
1419 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2
1420 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm3
1421 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm4
1422 ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm5
1423 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6
1424 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7
1425 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8
1426 ; AVX512-FCP-NEXT: vpsrlq $48, %xmm3, %xmm3
1427 ; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
1428 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1429 ; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1430 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
1431 ; AVX512-FCP-NEXT: vpsrld $16, %xmm5, %xmm1
1432 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
1433 ; AVX512-FCP-NEXT: vpbroadcastd 12(%r10), %xmm2
1434 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7]
1435 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7]
1436 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm7[1,3,1,3]
1437 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u],zero,zero,zero,zero,ymm1[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[20,21,28,29,u,u,u,u]
1438 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm6[1,3,1,3]
1439 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm2[u,u,u,u]
1440 ; AVX512-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1
1441 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1]
1442 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21]
1443 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
1444 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm3 = (mem & ~ymm3) | ymm2
1445 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (mem & (ymm3 ^ ymm1))
1446 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1
1447 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,2,0,2]
1448 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
1449 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,1,3]
1450 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero
1451 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
1452 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,2,0]
1453 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,8,9],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,28,29,20,21]
1454 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,0,0,5,2,6,0]
1455 ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm4, %ymm4
1456 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[20,21,24,25]
1457 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
1458 ; AVX512-FCP-NEXT: vporq %zmm2, %zmm3, %zmm2
1459 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,2,0,2]
1460 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm3[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm3[u,u,u,u]
1461 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,4,5,12,13],zero,zero,ymm3[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm3[u,u,u,u,u,u,u,u]
1462 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
1463 ; AVX512-FCP-NEXT: vpbroadcastd (%r10), %ymm4
1464 ; AVX512-FCP-NEXT: vpbroadcastd 4(%r10), %ymm5
1465 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
1466 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm4 & mem) | zmm3
1467 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2))
1468 ; AVX512-FCP-NEXT: vmovdqa %xmm0, 96(%rax)
1469 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
1470 ; AVX512-FCP-NEXT: vmovdqa %ymm1, 64(%rax)
1471 ; AVX512-FCP-NEXT: vzeroupper
1472 ; AVX512-FCP-NEXT: retq
1474 ; AVX512DQ-LABEL: store_i16_stride7_vf8:
1475 ; AVX512DQ: # %bb.0:
1476 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
1477 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
1478 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
1479 ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1
1480 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm3
1481 ; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm4
1482 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm5
1483 ; AVX512DQ-NEXT: vmovdqa (%r9), %xmm6
1484 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7
1485 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm2
1486 ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm8
1487 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,2,0,2]
1488 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
1489 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm7[0,2,1,3]
1490 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,ymm10[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm10[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero
1491 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9
1492 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[4,5,u,u,u,u,u,u,u,u,u,u,u,u,6,7,22,23,u,u,u,u,u,u,u,u,u,u,u,u,24,25]
1493 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm2[2,3,0,1]
1494 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[2,2,2,2,4,5,6,7,10,10,10,10,12,13,14,15]
1495 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,1,2,2,4,5,6,6]
1496 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4,5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11,12,13],ymm11[14],ymm10[15]
1497 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10
1498 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,2,2,0]
1499 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,8,9],zero,zero,zero,zero,ymm11[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm11[u,u,u,u,u,u,28,29,20,21]
1500 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
1501 ; AVX512DQ-NEXT: vporq %zmm9, %zmm10, %zmm9
1502 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm8[0,2,0,2]
1503 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm10[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm10[u,u,u,u]
1504 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,4,5,12,13],zero,zero,ymm10[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm10[u,u,u,u,u,u,u,u]
1505 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
1506 ; AVX512DQ-NEXT: vpbroadcastd (%r10), %ymm11
1507 ; AVX512DQ-NEXT: vpbroadcastd 4(%r10), %ymm12
1508 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11
1509 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = (zmm11 & mem) | zmm10
1510 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm9))
1511 ; AVX512DQ-NEXT: vpsrlq $48, %xmm4, %xmm4
1512 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
1513 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1514 ; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1515 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7]
1516 ; AVX512DQ-NEXT: vpsrld $16, %xmm6, %xmm1
1517 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
1518 ; AVX512DQ-NEXT: vpbroadcastd 12(%r10), %xmm3
1519 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6],xmm3[7]
1520 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7]
1521 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm2[1,3,1,3]
1522 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u],zero,zero,zero,zero,ymm1[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[20,21,28,29,u,u,u,u]
1523 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,3,1,3]
1524 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm2[u,u,u,u]
1525 ; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1
1526 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1]
1527 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21]
1528 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
1529 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm3 = (mem & ~ymm3) | ymm2
1530 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (mem & (ymm3 ^ ymm1))
1531 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1
1532 ; AVX512DQ-NEXT: vmovdqa %xmm0, 96(%rax)
1533 ; AVX512DQ-NEXT: vmovdqa %ymm1, 64(%rax)
1534 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rax)
1535 ; AVX512DQ-NEXT: vzeroupper
1536 ; AVX512DQ-NEXT: retq
1538 ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf8:
1539 ; AVX512DQ-FCP: # %bb.0:
1540 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1541 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1542 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
1543 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1
1544 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2
1545 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm3
1546 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm4
1547 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm5
1548 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6
1549 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm7
1550 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm8
1551 ; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm3, %xmm3
1552 ; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
1553 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1554 ; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1555 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
1556 ; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm5, %xmm1
1557 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
1558 ; AVX512DQ-FCP-NEXT: vpbroadcastd 12(%r10), %xmm2
1559 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6],xmm2[7]
1560 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4],xmm1[5,6,7]
1561 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm7[1,3,1,3]
1562 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u],zero,zero,zero,zero,ymm1[2,3,10,11,u,u,u,u,u,u],zero,zero,zero,zero,ymm1[20,21,28,29,u,u,u,u]
1563 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm6[1,3,1,3]
1564 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,2,3,10,11],zero,zero,zero,zero,ymm2[u,u,u,u,u,u,20,21,28,29],zero,zero,zero,zero,ymm2[u,u,u,u]
1565 ; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1
1566 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm8[1,3,3,1]
1567 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,8,9],zero,zero,ymm2[u,u,u,u,u,u,u,u,2,3,18,19],zero,zero,ymm2[u,u,u,u,u,u,u,u,28,29,20,21]
1568 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
1569 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm3 = (mem & ~ymm3) | ymm2
1570 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (mem & (ymm3 ^ ymm1))
1571 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1
1572 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,2,0,2]
1573 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[18,19,26,27,u,u,u,u,u,u],zero,zero,zero,zero
1574 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,1,3]
1575 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[u,u,u,u,u,u,6,7,14,15],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,16,17,24,25],zero,zero,zero,zero
1576 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
1577 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,2,0]
1578 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,8,9],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,28,29,20,21]
1579 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,5,0,0,5,2,6,0]
1580 ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm4, %ymm4
1581 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[20,21,24,25]
1582 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
1583 ; AVX512DQ-FCP-NEXT: vporq %zmm2, %zmm3, %zmm2
1584 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,2,0,2]
1585 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,0,1,8,9],zero,zero,ymm3[u,u,u,u,u,u,u,u,18,19,26,27],zero,zero,ymm3[u,u,u,u]
1586 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,4,5,12,13],zero,zero,ymm3[u,u,u,u,u,u,u,u,22,23,30,31],zero,zero,ymm3[u,u,u,u,u,u,u,u]
1587 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
1588 ; AVX512DQ-FCP-NEXT: vpbroadcastd (%r10), %ymm4
1589 ; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%r10), %ymm5
1590 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
1591 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm4 & mem) | zmm3
1592 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2))
1593 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 96(%rax)
1594 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
1595 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 64(%rax)
1596 ; AVX512DQ-FCP-NEXT: vzeroupper
1597 ; AVX512DQ-FCP-NEXT: retq
1599 ; AVX512BW-LABEL: store_i16_stride7_vf8:
1600 ; AVX512BW: # %bb.0:
1601 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
1602 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
1603 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
1604 ; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
1605 ; AVX512BW-NEXT: vmovdqa (%r8), %xmm2
1606 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
1607 ; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
1608 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1609 ; AVX512BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1
1610 ; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1
1611 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28]
1612 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
1613 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,0,0,0,0,0,0,0,0]
1614 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
1615 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, 96(%rax)
1616 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax)
1617 ; AVX512BW-NEXT: vmovdqa %ymm3, 64(%rax)
1618 ; AVX512BW-NEXT: vzeroupper
1619 ; AVX512BW-NEXT: retq
1621 ; AVX512BW-FCP-LABEL: store_i16_stride7_vf8:
1622 ; AVX512BW-FCP: # %bb.0:
1623 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1624 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1625 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
1626 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
1627 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm2
1628 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
1629 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
1630 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1631 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1
1632 ; AVX512BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1
1633 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28]
1634 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
1635 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,0,0,0,0,0,0,0,0]
1636 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
1637 ; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm3, 96(%rax)
1638 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
1639 ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, 64(%rax)
1640 ; AVX512BW-FCP-NEXT: vzeroupper
1641 ; AVX512BW-FCP-NEXT: retq
1643 ; AVX512DQ-BW-LABEL: store_i16_stride7_vf8:
1644 ; AVX512DQ-BW: # %bb.0:
1645 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
1646 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
1647 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
1648 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
1649 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm2
1650 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
1651 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
1652 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1653 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1
1654 ; AVX512DQ-BW-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1
1655 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28]
1656 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
1657 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,0,0,0,0,0,0,0,0]
1658 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
1659 ; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm3, 96(%rax)
1660 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax)
1661 ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, 64(%rax)
1662 ; AVX512DQ-BW-NEXT: vzeroupper
1663 ; AVX512DQ-BW-NEXT: retq
1665 ; AVX512DQ-BW-FCP-LABEL: store_i16_stride7_vf8:
1666 ; AVX512DQ-BW-FCP: # %bb.0:
1667 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1668 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1669 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
1670 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
1671 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm2
1672 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1
1673 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
1674 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1675 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm1
1676 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1
1677 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28]
1678 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
1679 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,0,0,0,0,0,0,0,0]
1680 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
1681 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm3, 96(%rax)
1682 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
1683 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, 64(%rax)
1684 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
1685 ; AVX512DQ-BW-FCP-NEXT: retq
1686 %in.vec0 = load <8 x i16>, ptr %in.vecptr0, align 64
1687 %in.vec1 = load <8 x i16>, ptr %in.vecptr1, align 64
1688 %in.vec2 = load <8 x i16>, ptr %in.vecptr2, align 64
1689 %in.vec3 = load <8 x i16>, ptr %in.vecptr3, align 64
1690 %in.vec4 = load <8 x i16>, ptr %in.vecptr4, align 64
1691 %in.vec5 = load <8 x i16>, ptr %in.vecptr5, align 64
1692 %in.vec6 = load <8 x i16>, ptr %in.vecptr6, align 64
1693 %1 = shufflevector <8 x i16> %in.vec0, <8 x i16> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1694 %2 = shufflevector <8 x i16> %in.vec2, <8 x i16> %in.vec3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1695 %3 = shufflevector <8 x i16> %in.vec4, <8 x i16> %in.vec5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
1696 %4 = shufflevector <16 x i16> %1, <16 x i16> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
1697 %5 = shufflevector <8 x i16> %in.vec6, <8 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1698 %6 = shufflevector <16 x i16> %3, <16 x i16> %5, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
1699 %7 = shufflevector <24 x i16> %6, <24 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
1700 %8 = shufflevector <32 x i16> %4, <32 x i16> %7, <56 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55>
1701 %interleaved.vec = shufflevector <56 x i16> %8, <56 x i16> poison, <56 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55>
1702 store <56 x i16> %interleaved.vec, ptr %out.vec, align 64
1703 ret void
1704 }
1706 define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind {
1707 ; SSE-LABEL: store_i16_stride7_vf16:
1708 ; SSE: # %bb.0:
1709 ; SSE-NEXT: subq $216, %rsp
1710 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1711 ; SSE-NEXT: movdqa 16(%rdi), %xmm6
1712 ; SSE-NEXT: movdqa 16(%rsi), %xmm9
1713 ; SSE-NEXT: movdqa 16(%rdx), %xmm15
1714 ; SSE-NEXT: movdqa 16(%rcx), %xmm1
1715 ; SSE-NEXT: movdqa 16(%r8), %xmm8
1716 ; SSE-NEXT: movdqa 16(%r9), %xmm7
1717 ; SSE-NEXT: movdqa 16(%rax), %xmm3
1718 ; SSE-NEXT: movdqa %xmm1, %xmm0
1719 ; SSE-NEXT: movdqa %xmm1, %xmm5
1720 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1721 ; SSE-NEXT: psrlq $48, %xmm0
1722 ; SSE-NEXT: movdqa %xmm15, %xmm1
1723 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1]
1724 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535]
1725 ; SSE-NEXT: pandn %xmm1, %xmm2
1726 ; SSE-NEXT: movdqa %xmm6, %xmm0
1727 ; SSE-NEXT: movdqa %xmm6, %xmm10
1728 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1729 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
1730 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1731 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1732 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1733 ; SSE-NEXT: por %xmm2, %xmm0
1734 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1735 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,3,3]
1736 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,65535,65535]
1737 ; SSE-NEXT: movdqa %xmm4, %xmm2
1738 ; SSE-NEXT: pandn %xmm1, %xmm2
1739 ; SSE-NEXT: por %xmm0, %xmm2
1740 ; SSE-NEXT: movdqa %xmm8, %xmm1
1741 ; SSE-NEXT: movdqa %xmm7, %xmm6
1742 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1743 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
1744 ; SSE-NEXT: movdqa %xmm1, %xmm7
1745 ; SSE-NEXT: movdqa %xmm6, %xmm1
1746 ; SSE-NEXT: psrld $16, %xmm1
1747 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[2,0]
1748 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
1749 ; SSE-NEXT: movaps {{.*#+}} xmm6 = [0,65535,65535,65535,65535,65535,65535,0]
1750 ; SSE-NEXT: andps %xmm6, %xmm0
1751 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1752 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3]
1753 ; SSE-NEXT: andnps %xmm1, %xmm6
1754 ; SSE-NEXT: orps %xmm0, %xmm6
1755 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1756 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,1]
1757 ; SSE-NEXT: movdqa %xmm4, %xmm1
1758 ; SSE-NEXT: pandn %xmm0, %xmm1
1759 ; SSE-NEXT: movdqa %xmm7, %xmm0
1760 ; SSE-NEXT: movdqa %xmm7, %xmm8
1761 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1762 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
1763 ; SSE-NEXT: pand %xmm4, %xmm0
1764 ; SSE-NEXT: por %xmm1, %xmm0
1765 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,0,0,65535,65535]
1766 ; SSE-NEXT: movdqa %xmm7, %xmm1
1767 ; SSE-NEXT: pandn %xmm0, %xmm1
1768 ; SSE-NEXT: movdqa %xmm5, %xmm0
1769 ; SSE-NEXT: psrld $16, %xmm0
1770 ; SSE-NEXT: movdqa %xmm15, %xmm3
1771 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
1772 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,65535,65535,65535,65535,65535]
1773 ; SSE-NEXT: movdqa %xmm0, %xmm6
1774 ; SSE-NEXT: pandn %xmm3, %xmm6
1775 ; SSE-NEXT: movdqa %xmm9, %xmm3
1776 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3]
1777 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
1778 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4]
1779 ; SSE-NEXT: pand %xmm0, %xmm3
1780 ; SSE-NEXT: por %xmm6, %xmm3
1781 ; SSE-NEXT: pand %xmm7, %xmm3
1782 ; SSE-NEXT: por %xmm1, %xmm3
1783 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1784 ; SSE-NEXT: movdqa (%rax), %xmm9
1785 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,2,3]
1786 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535]
1787 ; SSE-NEXT: movdqa %xmm11, %xmm3
1788 ; SSE-NEXT: pandn %xmm1, %xmm3
1789 ; SSE-NEXT: movdqa (%r8), %xmm2
1790 ; SSE-NEXT: movdqa (%r9), %xmm10
1791 ; SSE-NEXT: movdqa %xmm10, %xmm1
1792 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1793 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1794 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1795 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7]
1796 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,5,4]
1797 ; SSE-NEXT: pand %xmm11, %xmm6
1798 ; SSE-NEXT: por %xmm3, %xmm6
1799 ; SSE-NEXT: movdqa (%rdx), %xmm13
1800 ; SSE-NEXT: movdqa (%rcx), %xmm5
1801 ; SSE-NEXT: movdqa %xmm13, %xmm14
1802 ; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7]
1803 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1804 ; SSE-NEXT: movdqa (%rdi), %xmm11
1805 ; SSE-NEXT: movdqa (%rsi), %xmm1
1806 ; SSE-NEXT: movdqa %xmm11, %xmm12
1807 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1808 ; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm1[4],xmm12[5],xmm1[5],xmm12[6],xmm1[6],xmm12[7],xmm1[7]
1809 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1810 ; SSE-NEXT: movdqa %xmm12, %xmm3
1811 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm14[2,3]
1812 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm6[0,3]
1813 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1814 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3]
1815 ; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
1816 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
1817 ; SSE-NEXT: pand %xmm4, %xmm2
1818 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,1,0,1]
1819 ; SSE-NEXT: movdqa %xmm9, %xmm14
1820 ; SSE-NEXT: pandn %xmm6, %xmm4
1821 ; SSE-NEXT: por %xmm2, %xmm4
1822 ; SSE-NEXT: movdqa %xmm5, %xmm3
1823 ; SSE-NEXT: movdqa %xmm5, %xmm2
1824 ; SSE-NEXT: psrld $16, %xmm3
1825 ; SSE-NEXT: movdqa %xmm13, %xmm6
1826 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
1827 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3]
1828 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7]
1829 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4]
1830 ; SSE-NEXT: pand %xmm0, %xmm3
1831 ; SSE-NEXT: pandn %xmm6, %xmm0
1832 ; SSE-NEXT: por %xmm3, %xmm0
1833 ; SSE-NEXT: pand %xmm7, %xmm0
1834 ; SSE-NEXT: pandn %xmm4, %xmm7
1835 ; SSE-NEXT: por %xmm0, %xmm7
1836 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1837 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535]
1838 ; SSE-NEXT: movdqa %xmm7, %xmm0
1839 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1840 ; SSE-NEXT: movdqa %xmm8, %xmm3
1841 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1842 ; SSE-NEXT: por %xmm0, %xmm3
1843 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,0,0,0,65535,65535,65535,65535]
1844 ; SSE-NEXT: movdqa %xmm8, %xmm6
1845 ; SSE-NEXT: pandn %xmm3, %xmm6
1846 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,2,2]
1847 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535]
1848 ; SSE-NEXT: movdqa %xmm0, %xmm4
1849 ; SSE-NEXT: pandn %xmm3, %xmm4
1850 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1851 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,4,4]
1852 ; SSE-NEXT: pand %xmm0, %xmm3
1853 ; SSE-NEXT: por %xmm4, %xmm3
1854 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1855 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm3[3,3]
1856 ; SSE-NEXT: movdqa %xmm15, %xmm10
1857 ; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
1858 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
1859 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1860 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7]
1861 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm9[0,2]
1862 ; SSE-NEXT: andps %xmm8, %xmm1
1863 ; SSE-NEXT: orps %xmm6, %xmm1
1864 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1865 ; SSE-NEXT: movdqa %xmm14, %xmm11
1866 ; SSE-NEXT: pandn %xmm14, %xmm7
1867 ; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload
1868 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1869 ; SSE-NEXT: por %xmm7, %xmm3
1870 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,4,4]
1871 ; SSE-NEXT: pand %xmm0, %xmm4
1872 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm13[2,2,2,2]
1873 ; SSE-NEXT: pandn %xmm5, %xmm0
1874 ; SSE-NEXT: por %xmm4, %xmm0
1875 ; SSE-NEXT: movdqa %xmm12, %xmm4
1876 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[3,3]
1877 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7]
1878 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2]
1879 ; SSE-NEXT: andps %xmm8, %xmm0
1880 ; SSE-NEXT: pandn %xmm3, %xmm8
1881 ; SSE-NEXT: por %xmm0, %xmm8
1882 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1883 ; SSE-NEXT: movdqa %xmm13, %xmm0
1884 ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3]
1885 ; SSE-NEXT: psrlq $48, %xmm2
1886 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1]
1887 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535]
1888 ; SSE-NEXT: pandn %xmm0, %xmm1
1889 ; SSE-NEXT: movdqa %xmm12, %xmm0
1890 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1891 ; SSE-NEXT: por %xmm1, %xmm0
1892 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1893 ; SSE-NEXT: movdqa %xmm3, %xmm2
1894 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1895 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1896 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1897 ; SSE-NEXT: psrld $16, %xmm1
1898 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
1899 ; SSE-NEXT: movdqa %xmm14, %xmm1
1900 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
1901 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [0,65535,65535,65535,65535,0,0,0]
1902 ; SSE-NEXT: pand %xmm8, %xmm0
1903 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
1904 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1]
1905 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4]
1906 ; SSE-NEXT: pandn %xmm1, %xmm8
1907 ; SSE-NEXT: por %xmm0, %xmm8
1908 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
1909 ; SSE-NEXT: movaps %xmm4, %xmm0
1910 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm10[2,0]
1911 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1912 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,0]
1913 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1914 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
1915 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,6,7]
1916 ; SSE-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0]
1917 ; SSE-NEXT: andps %xmm2, %xmm1
1918 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
1919 ; SSE-NEXT: andnps %xmm0, %xmm2
1920 ; SSE-NEXT: orps %xmm1, %xmm2
1921 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535]
1922 ; SSE-NEXT: andps %xmm1, %xmm2
1923 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
1924 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3]
1925 ; SSE-NEXT: andnps %xmm0, %xmm1
1926 ; SSE-NEXT: orps %xmm2, %xmm1
1927 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1928 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1929 ; SSE-NEXT: movdqa %xmm0, %xmm1
1930 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[1,1,1,1,4,5,6,7]
1931 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1932 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1933 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
1934 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1935 ; SSE-NEXT: movdqa %xmm2, %xmm0
1936 ; SSE-NEXT: psrld $16, %xmm0
1937 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1938 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,65535]
1939 ; SSE-NEXT: movdqa %xmm0, %xmm2
1940 ; SSE-NEXT: pandn %xmm1, %xmm2
1941 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,5,6,6]
1942 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
1943 ; SSE-NEXT: pand %xmm0, %xmm1
1944 ; SSE-NEXT: por %xmm2, %xmm1
1945 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1946 ; SSE-NEXT: movaps %xmm14, %xmm5
1947 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,2],xmm7[1,1]
1948 ; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,0,0,0,65535,65535,65535]
1949 ; SSE-NEXT: movaps %xmm6, %xmm3
1950 ; SSE-NEXT: andnps %xmm5, %xmm3
1951 ; SSE-NEXT: pand %xmm6, %xmm1
1952 ; SSE-NEXT: orps %xmm1, %xmm3
1953 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
1954 ; SSE-NEXT: movdqa %xmm15, %xmm1
1955 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm15[1,1,1,1,4,5,6,7]
1956 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1957 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1958 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
1959 ; SSE-NEXT: psrld $16, %xmm9
1960 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
1961 ; SSE-NEXT: movdqa %xmm0, %xmm9
1962 ; SSE-NEXT: pandn %xmm1, %xmm9
1963 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,5,6,6]
1964 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
1965 ; SSE-NEXT: pand %xmm0, %xmm1
1966 ; SSE-NEXT: por %xmm9, %xmm1
1967 ; SSE-NEXT: movaps (%rsp), %xmm5 # 16-byte Reload
1968 ; SSE-NEXT: movaps %xmm5, %xmm9
1969 ; SSE-NEXT: movdqa %xmm11, %xmm2
1970 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,2],xmm11[1,1]
1971 ; SSE-NEXT: pand %xmm6, %xmm1
1972 ; SSE-NEXT: andnps %xmm9, %xmm6
1973 ; SSE-NEXT: orps %xmm1, %xmm6
1974 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1975 ; SSE-NEXT: pslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9]
1976 ; SSE-NEXT: movdqa %xmm0, %xmm1
1977 ; SSE-NEXT: pandn %xmm9, %xmm1
1978 ; SSE-NEXT: pslldq {{.*#+}} xmm12 = zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
1979 ; SSE-NEXT: pand %xmm0, %xmm12
1980 ; SSE-NEXT: por %xmm1, %xmm12
1981 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,0,65535]
1982 ; SSE-NEXT: movdqa %xmm1, %xmm9
1983 ; SSE-NEXT: pandn %xmm12, %xmm9
1984 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
1985 ; SSE-NEXT: # xmm11 = xmm11[1],mem[0]
1986 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
1987 ; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4,4,5,5,6,6,7,7]
1988 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm12[2,1]
1989 ; SSE-NEXT: andps %xmm1, %xmm11
1990 ; SSE-NEXT: orps %xmm9, %xmm11
1991 ; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9]
1992 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
1993 ; SSE-NEXT: pslldq {{.*#+}} xmm9 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
1994 ; SSE-NEXT: pand %xmm0, %xmm9
1995 ; SSE-NEXT: pandn %xmm10, %xmm0
1996 ; SSE-NEXT: por %xmm9, %xmm0
1997 ; SSE-NEXT: movdqa %xmm7, %xmm9
1998 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,3],xmm4[0,1]
1999 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2000 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
2001 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[2,1]
2002 ; SSE-NEXT: andps %xmm1, %xmm7
2003 ; SSE-NEXT: pandn %xmm0, %xmm1
2004 ; SSE-NEXT: por %xmm7, %xmm1
2005 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm13 = xmm13[0],xmm15[0]
2006 ; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
2007 ; SSE-NEXT: # xmm13 = xmm13[2,0],mem[2,1]
2008 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,1,1]
2009 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,1,1]
2010 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
2011 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,0,0,65535]
2012 ; SSE-NEXT: movdqa %xmm4, %xmm5
2013 ; SSE-NEXT: pandn %xmm0, %xmm5
2014 ; SSE-NEXT: andps %xmm4, %xmm13
2015 ; SSE-NEXT: por %xmm13, %xmm5
2016 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2017 ; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2018 ; SSE-NEXT: # xmm0 = xmm0[0],mem[0]
2019 ; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2020 ; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1]
2021 ; SSE-NEXT: movaps %xmm0, %xmm2
2022 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,1,1]
2023 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm9[0,0,1,1]
2024 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
2025 ; SSE-NEXT: andps %xmm4, %xmm2
2026 ; SSE-NEXT: pandn %xmm0, %xmm4
2027 ; SSE-NEXT: por %xmm2, %xmm4
2028 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
2029 ; SSE-NEXT: movdqa %xmm4, 112(%rax)
2030 ; SSE-NEXT: movdqa %xmm5, (%rax)
2031 ; SSE-NEXT: movdqa %xmm1, 176(%rax)
2032 ; SSE-NEXT: movaps %xmm11, 64(%rax)
2033 ; SSE-NEXT: movaps %xmm6, 32(%rax)
2034 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2035 ; SSE-NEXT: movaps %xmm0, 48(%rax)
2036 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2037 ; SSE-NEXT: movaps %xmm0, 160(%rax)
2038 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2039 ; SSE-NEXT: movaps %xmm0, 16(%rax)
2040 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2041 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
2042 ; SSE-NEXT: movaps %xmm0, 80(%rax)
2043 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2044 ; SSE-NEXT: movaps %xmm0, 128(%rax)
2045 ; SSE-NEXT: movaps %xmm3, 144(%rax)
2046 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2047 ; SSE-NEXT: movaps %xmm0, 192(%rax)
2048 ; SSE-NEXT: movdqa %xmm8, 96(%rax)
2049 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2050 ; SSE-NEXT: movaps %xmm0, 208(%rax)
2051 ; SSE-NEXT: addq $216, %rsp
2052 ; SSE-NEXT: retq
2054 ; AVX-LABEL: store_i16_stride7_vf16:
2055 ; AVX: # %bb.0:
2056 ; AVX-NEXT: subq $40, %rsp
2057 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
2058 ; AVX-NEXT: vmovdqa (%rdx), %xmm5
2059 ; AVX-NEXT: vmovdqa 16(%rdx), %xmm14
2060 ; AVX-NEXT: vmovdqa (%rcx), %xmm7
2061 ; AVX-NEXT: vmovdqa 16(%rcx), %xmm15
2062 ; AVX-NEXT: vpsrlq $48, %xmm15, %xmm0
2063 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm14[1],xmm0[1]
2064 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
2065 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2
2066 ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
2067 ; AVX-NEXT: vandnps %ymm2, %ymm0, %ymm2
2068 ; AVX-NEXT: vmovdqa 16(%rsi), %xmm3
2069 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2070 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm4
2071 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2072 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
2073 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,2,3,3]
2074 ; AVX-NEXT: vpsrldq {{.*#+}} xmm6 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2075 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
2076 ; AVX-NEXT: vandps %ymm0, %ymm3, %ymm3
2077 ; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2
2078 ; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535]
2079 ; AVX-NEXT: vandps %ymm6, %ymm2, %ymm2
2080 ; AVX-NEXT: vmovdqa 16(%r8), %xmm3
2081 ; AVX-NEXT: vmovdqa 16(%r9), %xmm8
2082 ; AVX-NEXT: vpsrld $16, %xmm8, %xmm10
2083 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm3[2],xmm10[2],xmm3[3],xmm10[3]
2084 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7]
2085 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7]
2086 ; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
2087 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10
2088 ; AVX-NEXT: vandnps %ymm10, %ymm6, %ymm6
2089 ; AVX-NEXT: vorps %ymm6, %ymm2, %ymm2
2090 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm10
2091 ; AVX-NEXT: vmovdqa 16(%rax), %xmm13
2092 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[3,3,3,3]
2093 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0],xmm10[1,2,3,4,5,6],xmm12[7]
2094 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2095 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,3,2,3]
2096 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3,4,5,6,7]
2097 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2098 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,0,1]
2099 ; AVX-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
2100 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm9
2101 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[2,2,2,2]
2102 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[3,3,3,3,4,5,6,7]
2103 ; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4]
2104 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm2[6],xmm10[7]
2105 ; AVX-NEXT: vmovdqa (%rsi), %xmm6
2106 ; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
2107 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm10, %ymm10
2108 ; AVX-NEXT: vmovdqa (%rdi), %xmm2
2109 ; AVX-NEXT: vandnps %ymm9, %ymm0, %ymm9
2110 ; AVX-NEXT: vandps %ymm0, %ymm10, %ymm0
2111 ; AVX-NEXT: vorps %ymm0, %ymm9, %ymm0
2112 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
2113 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,2,2,4,5,6,7]
2114 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
2115 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,2,3,3]
2116 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5,6,7]
2117 ; AVX-NEXT: vmovaps {{.*#+}} ymm10 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
2118 ; AVX-NEXT: vandps %ymm0, %ymm10, %ymm0
2119 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
2120 ; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2121 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm13[3],xmm3[4,5,6,7]
2122 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm3
2123 ; AVX-NEXT: vandnps %ymm3, %ymm10, %ymm3
2124 ; AVX-NEXT: vorps %ymm3, %ymm0, %ymm0
2125 ; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
2126 ; AVX-NEXT: vpsrld $16, %xmm6, %xmm0
2127 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2128 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
2129 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,1,0,1]
2130 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
2131 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2132 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,3,3,3,4,5,6,7]
2133 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
2134 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2135 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[2,2,2,2]
2136 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm3[0,1,2,3,4,5],xmm10[6],xmm3[7]
2137 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
2138 ; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm3[0,1,2,3,4,5,6,6]
2139 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
2140 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10
2141 ; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
2142 ; AVX-NEXT: vandnps %ymm0, %ymm1, %ymm0
2143 ; AVX-NEXT: vandps %ymm1, %ymm10, %ymm10
2144 ; AVX-NEXT: vorps %ymm0, %ymm10, %ymm1
2145 ; AVX-NEXT: vmovdqa (%r8), %xmm10
2146 ; AVX-NEXT: vmovdqa (%r9), %xmm5
2147 ; AVX-NEXT: vmovdqa (%rax), %xmm12
2148 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
2149 ; AVX-NEXT: vpsrldq {{.*#+}} xmm11 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2150 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5,6,7]
2151 ; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm4[0,2],xmm12[1,3]
2152 ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm7
2153 ; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
2154 ; AVX-NEXT: vandps %ymm1, %ymm11, %ymm1
2155 ; AVX-NEXT: vandnps %ymm7, %ymm11, %ymm7
2156 ; AVX-NEXT: vorps %ymm7, %ymm1, %ymm0
2157 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2158 ; AVX-NEXT: vpsrld $16, %xmm15, %xmm1
2159 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
2160 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
2161 ; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,5,6,6]
2162 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
2163 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1
2164 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
2165 ; AVX-NEXT: vpsrld $16, %xmm11, %xmm7
2166 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2167 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
2168 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
2169 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,2,2,2,4,5,6,7]
2170 ; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,4]
2171 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm11, %ymm7
2172 ; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
2173 ; AVX-NEXT: vandnps %ymm1, %ymm11, %ymm1
2174 ; AVX-NEXT: vandps %ymm7, %ymm11, %ymm7
2175 ; AVX-NEXT: vorps %ymm1, %ymm7, %ymm1
2176 ; AVX-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
2177 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[0,1,0,1]
2178 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm11[5],xmm7[6,7]
2179 ; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm8[0,2],xmm13[1,3]
2180 ; AVX-NEXT: vmovaps %xmm8, %xmm14
2181 ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm7, %ymm7
2182 ; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
2183 ; AVX-NEXT: vandps %ymm1, %ymm11, %ymm1
2184 ; AVX-NEXT: vandnps %ymm7, %ymm11, %ymm7
2185 ; AVX-NEXT: vorps %ymm7, %ymm1, %ymm7
2186 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
2187 ; AVX-NEXT: vpsrld $16, %xmm11, %xmm1
2188 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2189 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
2190 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
2191 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
2192 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
2193 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
2194 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7]
2195 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
2196 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
2197 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4]
2198 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
2199 ; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
2200 ; AVX-NEXT: vandnps %ymm1, %ymm3, %ymm1
2201 ; AVX-NEXT: vandps %ymm3, %ymm2, %ymm2
2202 ; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1
2203 ; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
2204 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,1,0,1]
2205 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7]
2206 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm4[0,1,0,1]
2207 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm12[0,0,0,0]
2208 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
2209 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2210 ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
2211 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
2212 ; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0
2213 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
2214 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
2215 ; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
2216 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
2217 ; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
2218 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,2,3,3]
2219 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
2220 ; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
2221 ; AVX-NEXT: vandnps %ymm1, %ymm3, %ymm1
2222 ; AVX-NEXT: vandps %ymm3, %ymm2, %ymm2
2223 ; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1
2224 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
2225 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
2226 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4]
2227 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,3,2,3]
2228 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
2229 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
2230 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7]
2231 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
2232 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,2,3,3]
2233 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7]
2234 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
2235 ; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
2236 ; AVX-NEXT: vandnps %ymm1, %ymm3, %ymm1
2237 ; AVX-NEXT: vandps %ymm3, %ymm2, %ymm2
2238 ; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1
2239 ; AVX-NEXT: vpsrlq $48, %xmm11, %xmm2
2240 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm8[1],xmm2[1]
2241 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,0,1,1]
2242 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
2243 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2244 ; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
2245 ; AVX-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
2246 ; AVX-NEXT: vpsrldq {{.*#+}} xmm4 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2247 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7]
2248 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
2249 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
2250 ; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
2251 ; AVX-NEXT: vandnps %ymm2, %ymm4, %ymm2
2252 ; AVX-NEXT: vandps %ymm4, %ymm3, %ymm3
2253 ; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2
2254 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm14[0,1,0,1]
2255 ; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm13[0,0,0,0]
2256 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3]
2257 ; AVX-NEXT: vpsrld $16, %xmm5, %xmm4
2258 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm10[2],xmm4[2],xmm10[3],xmm4[3]
2259 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
2260 ; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[8,9,u,u,u,u,u,u,u,u,6,7,10,11,12,13]
2261 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
2262 ; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
2263 ; AVX-NEXT: vandps %ymm4, %ymm2, %ymm2
2264 ; AVX-NEXT: vandnps %ymm3, %ymm4, %ymm3
2265 ; AVX-NEXT: vorps %ymm3, %ymm2, %ymm2
2266 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
2267 ; AVX-NEXT: vmovaps %ymm2, 96(%rax)
2268 ; AVX-NEXT: vmovaps %ymm1, 64(%rax)
2269 ; AVX-NEXT: vmovaps %ymm0, (%rax)
2270 ; AVX-NEXT: vmovaps %ymm7, 128(%rax)
2271 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2272 ; AVX-NEXT: vmovaps %ymm0, 32(%rax)
2273 ; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
2274 ; AVX-NEXT: vmovaps %ymm0, 160(%rax)
2275 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2276 ; AVX-NEXT: vmovaps %xmm0, 192(%rax)
2277 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2278 ; AVX-NEXT: vmovaps %xmm0, 208(%rax)
2279 ; AVX-NEXT: addq $40, %rsp
2280 ; AVX-NEXT: vzeroupper
2281 ; AVX-NEXT: retq
2283 ; AVX2-LABEL: store_i16_stride7_vf16:
2284 ; AVX2: # %bb.0:
2285 ; AVX2-NEXT: subq $40, %rsp
2286 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
2287 ; AVX2-NEXT: vmovdqa (%rdi), %ymm7
2288 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2289 ; AVX2-NEXT: vmovdqa (%rsi), %ymm6
2290 ; AVX2-NEXT: vmovdqa (%rdx), %ymm5
2291 ; AVX2-NEXT: vmovdqa (%rcx), %ymm13
2292 ; AVX2-NEXT: vmovdqa (%r8), %ymm4
2293 ; AVX2-NEXT: vmovdqa (%r9), %ymm2
2294 ; AVX2-NEXT: vmovdqa (%rax), %ymm1
2295 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2296 ; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4]
2297 ; AVX2-NEXT: vpermd %ymm7, %ymm0, %ymm0
2298 ; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,3,2,3,4,7,6,7]
2299 ; AVX2-NEXT: vmovdqa %ymm6, %ymm7
2300 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
2301 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535]
2302 ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
2303 ; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,3,0,0,0,4,0,0]
2304 ; AVX2-NEXT: vpermd %ymm5, %ymm8, %ymm8
2305 ; AVX2-NEXT: vmovdqa %ymm5, %ymm6
2306 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm13[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
2307 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
2308 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0]
2309 ; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
2310 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535]
2311 ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
2312 ; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,3,0,4]
2313 ; AVX2-NEXT: vpermd %ymm4, %ymm8, %ymm8
2314 ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,1,0,3,4,5,4,7]
2315 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15]
2316 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0]
2317 ; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
2318 ; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,0,0,3,0,0,0,4]
2319 ; AVX2-NEXT: vpermd %ymm1, %ymm9, %ymm9
2320 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0]
2321 ; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
2322 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
2323 ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
2324 ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
2325 ; AVX2-NEXT: vmovdqa (%rcx), %xmm10
2326 ; AVX2-NEXT: vmovdqa (%rdx), %xmm11
2327 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
2328 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
2329 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
2330 ; AVX2-NEXT: vmovdqa (%rdi), %xmm12
2331 ; AVX2-NEXT: vmovdqa (%rsi), %xmm5
2332 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
2333 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
2334 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
2335 ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3]
2336 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0]
2337 ; AVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm1
2338 ; AVX2-NEXT: vmovdqa (%r9), %xmm9
2339 ; AVX2-NEXT: vmovdqa (%r8), %xmm14
2340 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7]
2341 ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
2342 ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
2343 ; AVX2-NEXT: vpbroadcastd 8(%rax), %ymm0
2344 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
2345 ; AVX2-NEXT: vpblendvb %ymm15, %ymm8, %ymm0, %ymm0
2346 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
2347 ; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0
2348 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2349 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
2350 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[1,1,2,2]
2351 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7]
2352 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7]
2353 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
2354 ; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[1,1,2,3]
2355 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2],xmm15[3,4],xmm1[5],xmm15[6,7]
2356 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
2357 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
2358 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
2359 ; AVX2-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm1
2360 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3]
2361 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,4,5,7,6]
2362 ; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,2,3,3]
2363 ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3]
2364 ; AVX2-NEXT: vpbroadcastd 4(%rax), %ymm14
2365 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
2366 ; AVX2-NEXT: vpblendvb %ymm15, %ymm9, %ymm14, %ymm9
2367 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
2368 ; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1
2369 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2370 ; AVX2-NEXT: vmovdqa %ymm7, %ymm8
2371 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
2372 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
2373 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2374 ; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[3,3,3,3,7,7,7,7]
2375 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7,8,9,10],ymm1[11],ymm14[12,13],ymm1[14],ymm14[15]
2376 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
2377 ; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6]
2378 ; AVX2-NEXT: vmovdqa %ymm6, %ymm9
2379 ; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm6[3,3,3,3,7,7,7,7]
2380 ; AVX2-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15]
2381 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
2382 ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3]
2383 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
2384 ; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1
2385 ; AVX2-NEXT: vmovdqa %ymm2, %ymm7
2386 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm14 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
2387 ; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,3,6,6,6,7]
2388 ; AVX2-NEXT: vmovdqa %ymm4, %ymm2
2389 ; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm4[3,3,3,3,7,7,7,7]
2390 ; AVX2-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15]
2391 ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
2392 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
2393 ; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1
2394 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2395 ; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[2,3,3,3,6,7,7,7]
2396 ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2]
2397 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
2398 ; AVX2-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm14
2399 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3]
2400 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
2401 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
2402 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
2403 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,3,2,4,5,6,7]
2404 ; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,0,1,1]
2405 ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3]
2406 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
2407 ; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1
2408 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
2409 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
2410 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
2411 ; AVX2-NEXT: vpbroadcastd (%rax), %ymm10
2412 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0]
2413 ; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm10, %ymm0
2414 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
2415 ; AVX2-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm10
2416 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[2,2,2,2,6,6,6,6]
2417 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
2418 ; AVX2-NEXT: vmovdqa %ymm13, %ymm6
2419 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
2420 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,2,2,3]
2421 ; AVX2-NEXT: vmovdqa %ymm3, %ymm0
2422 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6]
2423 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm11 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
2424 ; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,2,6,6,6,6]
2425 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3],ymm11[4],ymm1[5,6,7,8],ymm11[9],ymm1[10,11],ymm11[12],ymm1[13,14,15]
2426 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2427 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
2428 ; AVX2-NEXT: vpblendvb %ymm11, %ymm12, %ymm1, %ymm3
2429 ; AVX2-NEXT: vmovdqa %ymm2, %ymm15
2430 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,2,2,3,5,6,6,7]
2431 ; AVX2-NEXT: vmovdqa %ymm7, %ymm2
2432 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm11 = ymm7[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
2433 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
2434 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm11[2],ymm1[3,4],ymm11[5],ymm1[6,7,8,9],ymm11[10],ymm1[11,12],ymm11[13],ymm1[14,15]
2435 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
2436 ; AVX2-NEXT: vmovdqa %ymm4, %ymm13
2437 ; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6]
2438 ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3]
2439 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535]
2440 ; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm11, %ymm1
2441 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
2442 ; AVX2-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm7
2443 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,1,1,5,5,5,5]
2444 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
2445 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5]
2446 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7,8,9],ymm1[10],ymm5[11,12],ymm1[13],ymm5[14,15]
2447 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
2448 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[0,1,1,3,4,5,5,7]
2449 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
2450 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4]
2451 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
2452 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2]
2453 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
2454 ; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
2455 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,0,2,1,4,4,6,5]
2456 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
2457 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
2458 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
2459 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
2460 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,1,1,3,4,5,5,7]
2461 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
2462 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0]
2463 ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
2464 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
2465 ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2466 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
2467 ; AVX2-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
2468 ; AVX2-NEXT: vmovaps %ymm2, 96(%rax)
2469 ; AVX2-NEXT: vmovdqa %ymm1, 128(%rax)
2470 ; AVX2-NEXT: vmovdqa %ymm7, 160(%rax)
2471 ; AVX2-NEXT: vmovdqa %ymm10, (%rax)
2472 ; AVX2-NEXT: vmovdqa %ymm14, 192(%rax)
2473 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2474 ; AVX2-NEXT: vmovaps %ymm0, 32(%rax)
2475 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2476 ; AVX2-NEXT: vmovaps %ymm0, 64(%rax)
2477 ; AVX2-NEXT: addq $40, %rsp
2478 ; AVX2-NEXT: vzeroupper
2479 ; AVX2-NEXT: retq
2481 ; AVX2-FP-LABEL: store_i16_stride7_vf16:
2482 ; AVX2-FP: # %bb.0:
2483 ; AVX2-FP-NEXT: subq $40, %rsp
2484 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2485 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm7
2486 ; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm6
2487 ; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm12
2488 ; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm4
2489 ; AVX2-FP-NEXT: vmovdqa (%r8), %ymm3
2490 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2491 ; AVX2-FP-NEXT: vmovdqa (%r9), %ymm13
2492 ; AVX2-FP-NEXT: vmovdqa (%rax), %ymm1
2493 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2494 ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4]
2495 ; AVX2-FP-NEXT: vpermd %ymm7, %ymm0, %ymm0
2496 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31]
2497 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535]
2498 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
2499 ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,3,0,0,0,4,0,0]
2500 ; AVX2-FP-NEXT: vpermd %ymm12, %ymm8, %ymm8
2501 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31]
2502 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0]
2503 ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
2504 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535]
2505 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
2506 ; AVX2-FP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,3,0,4]
2507 ; AVX2-FP-NEXT: vpermd %ymm3, %ymm8, %ymm8
2508 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31]
2509 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0]
2510 ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
2511 ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,0,0,3,0,0,0,4]
2512 ; AVX2-FP-NEXT: vpermd %ymm1, %ymm9, %ymm9
2513 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0]
2514 ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
2515 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
2516 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
2517 ; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
2518 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm10
2519 ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm11
2520 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
2521 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
2522 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
2523 ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm3
2524 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm5
2525 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
2526 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
2527 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
2528 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0]
2529 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm1
2530 ; AVX2-FP-NEXT: vmovdqa (%r9), %xmm9
2531 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm14
2532 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7]
2533 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
2534 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
2535 ; AVX2-FP-NEXT: vpbroadcastd 8(%rax), %ymm0
2536 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
2537 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm8, %ymm0, %ymm0
2538 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
2539 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0
2540 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2541 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm3[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
2542 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,2,2]
2543 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7]
2544 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u]
2545 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,2,3]
2546 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2],xmm15[3,4],xmm1[5],xmm15[6,7]
2547 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
2548 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
2549 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
2550 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm1
2551 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3]
2552 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
2553 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3]
2554 ; AVX2-FP-NEXT: vpbroadcastd 4(%rax), %ymm14
2555 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
2556 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm9, %ymm14, %ymm9
2557 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
2558 ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1
2559 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2560 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u]
2561 ; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm9
2562 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm7[3,3,3,3,7,7,7,7]
2563 ; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm8
2564 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7,8,9,10],ymm1[11],ymm14[12,13],ymm1[14],ymm14[15]
2565 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u]
2566 ; AVX2-FP-NEXT: vmovdqa %ymm12, %ymm7
2567 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[3,3,3,3,7,7,7,7]
2568 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15]
2569 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
2570 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3]
2571 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
2572 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1
2573 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
2574 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[3,3,3,3,7,7,7,7]
2575 ; AVX2-FP-NEXT: vmovdqa %ymm13, %ymm12
2576 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31]
2577 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15]
2578 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
2579 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
2580 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1
2581 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
2582 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[2,3,3,3,6,7,7,7]
2583 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2]
2584 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
2585 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm14
2586 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
2587 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
2588 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
2589 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
2590 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
2591 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3]
2592 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
2593 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1
2594 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
2595 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
2596 ; AVX2-FP-NEXT: vpbroadcastd (%rax), %ymm10
2597 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0]
2598 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm10, %ymm0
2599 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
2600 ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm10
2601 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[2,2,2,2,6,6,6,6]
2602 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
2603 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
2604 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,2,2,3]
2605 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u]
2606 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6]
2607 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6,7,8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14,15]
2608 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
2609 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
2610 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm3
2611 ; AVX2-FP-NEXT: vmovdqa %ymm12, %ymm2
2612 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u]
2613 ; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm15
2614 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm6[1,2,2,3,5,6,6,7]
2615 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1],ymm1[2],ymm11[3,4],ymm1[5],ymm11[6,7,8,9],ymm1[10],ymm11[11,12],ymm1[13],ymm11[14,15]
2616 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
2617 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[0,1,2,2,4,5,6,6]
2618 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3]
2619 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535]
2620 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm11, %ymm1
2621 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
2622 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm6
2623 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5]
2624 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23]
2625 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7,8,9],ymm1[10],ymm5[11,12],ymm1[13],ymm5[14,15]
2626 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
2627 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
2628 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,5,7]
2629 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
2630 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2]
2631 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
2632 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
2633 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,0,2,1,4,4,6,5]
2634 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21]
2635 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
2636 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
2637 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,1,1,3,4,5,5,7]
2638 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
2639 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0]
2640 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
2641 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
2642 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2643 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2644 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
2645 ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rax)
2646 ; AVX2-FP-NEXT: vmovdqa %ymm1, 128(%rax)
2647 ; AVX2-FP-NEXT: vmovdqa %ymm6, 160(%rax)
2648 ; AVX2-FP-NEXT: vmovdqa %ymm10, (%rax)
2649 ; AVX2-FP-NEXT: vmovdqa %ymm14, 192(%rax)
2650 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2651 ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax)
2652 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2653 ; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rax)
2654 ; AVX2-FP-NEXT: addq $40, %rsp
2655 ; AVX2-FP-NEXT: vzeroupper
2656 ; AVX2-FP-NEXT: retq
2658 ; AVX2-FCP-LABEL: store_i16_stride7_vf16:
2659 ; AVX2-FCP: # %bb.0:
2660 ; AVX2-FCP-NEXT: pushq %rax
2661 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2662 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm5
2663 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm7
2664 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm4
2665 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm6
2666 ; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm15
2667 ; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm3
2668 ; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm1
2669 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6]
2670 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
2671 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15]
2672 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
2673 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,8,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u]
2674 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,2,2,6,6,6,6]
2675 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15]
2676 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
2677 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
2678 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
2679 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [4,5,2,2,6,6,6,6]
2680 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm8, %ymm8
2681 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u]
2682 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[1,2,2,3,5,6,6,7]
2683 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7,8,9],ymm9[10],ymm10[11,12],ymm9[13],ymm10[14,15]
2684 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2]
2685 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535]
2686 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8
2687 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
2688 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
2689 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2690 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5]
2691 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23]
2692 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7,8,9],ymm0[10],ymm8[11,12],ymm0[13],ymm8[14,15]
2693 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
2694 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
2695 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,1,1,3,4,5,5,7]
2696 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15]
2697 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2]
2698 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
2699 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
2700 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7]
2701 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm8, %ymm8
2702 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2703 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm15[0,0,2,1,4,4,6,5]
2704 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21]
2705 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15]
2706 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
2707 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0]
2708 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8
2709 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
2710 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
2711 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2712 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4]
2713 ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm0
2714 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31]
2715 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535]
2716 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
2717 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,3,0,0,0,4,0,0]
2718 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm8
2719 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31]
2720 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0]
2721 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
2722 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535]
2723 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
2724 ; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm8 = [0,3,0,4]
2725 ; AVX2-FCP-NEXT: vpermd %ymm15, %ymm8, %ymm8
2726 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31]
2727 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0]
2728 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
2729 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,0,0,3,0,0,0,4]
2730 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm9, %ymm9
2731 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0]
2732 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
2733 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
2734 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
2735 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2736 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm10
2737 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm12
2738 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
2739 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
2740 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
2741 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm13
2742 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm14
2743 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
2744 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
2745 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
2746 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0]
2747 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm8, %ymm11
2748 ; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm8
2749 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2
2750 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
2751 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
2752 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1]
2753 ; AVX2-FCP-NEXT: vpbroadcastd 8(%rax), %ymm0
2754 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
2755 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0
2756 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
2757 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm0, %ymm11, %ymm11
2758 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
2759 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,2,2]
2760 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7]
2761 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,u,6,7,u,u,u,u,8,9,u,u,u,u]
2762 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[1,1,2,3]
2763 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1],xmm1[2],xmm9[3,4],xmm1[5],xmm9[6,7]
2764 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
2765 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
2766 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
2767 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm1
2768 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
2769 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
2770 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
2771 ; AVX2-FCP-NEXT: vpbroadcastd 4(%rax), %ymm8
2772 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
2773 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm2, %ymm8, %ymm2
2774 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
2775 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm8
2776 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u]
2777 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7]
2778 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
2779 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,12,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u]
2780 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,3,3,3,7,7,7,7]
2781 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15]
2782 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
2783 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
2784 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
2785 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
2786 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[3,3,3,3,7,7,7,7]
2787 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31]
2788 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
2789 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
2790 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
2791 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2792 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [6,7,3,3,7,7,6,7]
2793 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
2794 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
2795 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
2796 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
2797 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
2798 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
2799 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
2800 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
2801 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
2802 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
2803 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
2804 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
2805 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
2806 ; AVX2-FCP-NEXT: vpbroadcastd (%rax), %ymm3
2807 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0]
2808 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0
2809 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
2810 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
2811 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2812 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2813 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rax)
2814 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2815 ; AVX2-FCP-NEXT: vmovaps %ymm2, 128(%rax)
2816 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2817 ; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%rax)
2818 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax)
2819 ; AVX2-FCP-NEXT: vmovdqa %ymm1, 192(%rax)
2820 ; AVX2-FCP-NEXT: vmovdqa %ymm8, 32(%rax)
2821 ; AVX2-FCP-NEXT: vmovdqa %ymm11, 64(%rax)
2822 ; AVX2-FCP-NEXT: popq %rax
2823 ; AVX2-FCP-NEXT: vzeroupper
2824 ; AVX2-FCP-NEXT: retq
2826 ; AVX512-LABEL: store_i16_stride7_vf16:
2827 ; AVX512: # %bb.0:
2828 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
2829 ; AVX512-NEXT: vmovdqa (%rdi), %ymm8
2830 ; AVX512-NEXT: vmovdqa (%rsi), %ymm9
2831 ; AVX512-NEXT: vmovdqa (%rdx), %ymm6
2832 ; AVX512-NEXT: vmovdqa (%rcx), %ymm7
2833 ; AVX512-NEXT: vmovdqa (%r8), %ymm2
2834 ; AVX512-NEXT: vmovdqa (%r9), %ymm3
2835 ; AVX512-NEXT: vmovdqa (%rax), %ymm13
2836 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u],zero,zero,ymm7[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[16,17,u,u,u,u,u,u,u,u]
2837 ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,14,15],zero,zero,ymm6[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm6[u,u,u,u,u,u,u,u]
2838 ; AVX512-NEXT: vporq %ymm1, %ymm4, %ymm16
2839 ; AVX512-NEXT: vmovdqa (%rcx), %xmm14
2840 ; AVX512-NEXT: vmovdqa (%rdx), %xmm15
2841 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
2842 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm21
2843 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm9[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
2844 ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[12,13,14,15],zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm8[u,u,u,u,u,u,u,u,16,17,18,19]
2845 ; AVX512-NEXT: vporq %ymm1, %ymm4, %ymm17
2846 ; AVX512-NEXT: vmovdqa (%rdi), %xmm1
2847 ; AVX512-NEXT: vmovdqa (%rsi), %xmm4
2848 ; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[14,15,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[16,17,u,u],zero,zero
2849 ; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm2[u,u,u,u]
2850 ; AVX512-NEXT: vporq %ymm5, %ymm10, %ymm19
2851 ; AVX512-NEXT: vpbroadcastd 8(%rax), %ymm5
2852 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5
2853 ; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm13[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[16,17,u,u]
2854 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5
2855 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,1,1,3,4,5,5,7]
2856 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
2857 ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4]
2858 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
2859 ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm22
2860 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[2,2,2,2,6,6,6,6]
2861 ; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
2862 ; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7,8,9],ymm10[10],ymm12[11,12],ymm10[13],ymm12[14,15]
2863 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm8[1,1,1,1,5,5,5,5]
2864 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
2865 ; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5]
2866 ; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15]
2867 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
2868 ; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm20
2869 ; AVX512-NEXT: vprold $16, %xmm4, %xmm11
2870 ; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3]
2871 ; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7]
2872 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
2873 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,2,4,5,6,7]
2874 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,16,0,0,17,17,0,0,0,0,0,1,0,0,2,0]
2875 ; AVX512-NEXT: vpermi2d %zmm12, %zmm11, %zmm18
2876 ; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6]
2877 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
2878 ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6]
2879 ; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13,14,15]
2880 ; AVX512-NEXT: vmovdqa (%r9), %xmm11
2881 ; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
2882 ; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,2,2]
2883 ; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3],xmm14[4],xmm15[5,6],xmm14[7]
2884 ; AVX512-NEXT: vmovdqa (%r8), %xmm15
2885 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
2886 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7]
2887 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
2888 ; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm4
2889 ; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0
2890 ; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm1
2891 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,0,2,1,4,4,6,5]
2892 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm14 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
2893 ; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,0,0,4,4,4,4]
2894 ; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5],ymm0[6],ymm14[7,8,9,10],ymm0[11],ymm14[12,13],ymm0[14],ymm14[15]
2895 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
2896 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,5,7,6]
2897 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
2898 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
2899 ; AVX512-NEXT: vpermi2d %zmm11, %zmm0, %zmm15
2900 ; AVX512-NEXT: vprold $16, %ymm3, %ymm0
2901 ; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[1,2,2,3,5,6,6,7]
2902 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7,8,9],ymm0[10],ymm11[11,12],ymm0[13],ymm11[14,15]
2903 ; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[3,3,3,3,7,7,7,7]
2904 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
2905 ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
2906 ; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7,8,9,10],ymm9[11],ymm8[12,13],ymm9[14],ymm8[15]
2907 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,3,3,3,7,7,7,7]
2908 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
2909 ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6]
2910 ; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14,15]
2911 ; AVX512-NEXT: vmovdqa64 %xmm21, %xmm7
2912 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7]
2913 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1]
2914 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm10[2,1,2,3,4,5,6,7]
2915 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4]
2916 ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3]
2917 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
2918 ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm22[2,1,3,2]
2919 ; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm12[0,2,2,3]
2920 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,2,3,3]
2921 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
2922 ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3]
2923 ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
2924 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
2925 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
2926 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7]
2927 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
2928 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
2929 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
2930 ; AVX512-NEXT: vpermd %zmm13, %zmm3, %zmm3
2931 ; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,3,3,6,7,7,7]
2932 ; AVX512-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7
2933 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
2934 ; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm9
2935 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm9 = zmm7 ^ (mem & (zmm9 ^ zmm7))
2936 ; AVX512-NEXT: vinserti64x4 $1, %ymm19, %zmm4, %zmm4
2937 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm4 & mem)
2938 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm9))
2939 ; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm4
2940 ; AVX512-NEXT: vpermq {{.*#+}} zmm7 = zmm20[2,2,2,3,6,6,6,7]
2941 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm7 = zmm4 ^ (mem & (zmm7 ^ zmm4))
2942 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0
2943 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0))
2944 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm7))
2945 ; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm1[0,0,1,1,4,4,5,5]
2946 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = zmm18 ^ (mem & (zmm0 ^ zmm18))
2947 ; AVX512-NEXT: vpbroadcastd (%rax), %ymm1
2948 ; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm4
2949 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
2950 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm15))
2951 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0))
2952 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm8))
2953 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm13[2,1,3,2]
2954 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
2955 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm6))
2956 ; AVX512-NEXT: vmovdqa %ymm0, 192(%rcx)
2957 ; AVX512-NEXT: vmovdqa64 %zmm1, (%rcx)
2958 ; AVX512-NEXT: vmovdqa64 %zmm3, 128(%rcx)
2959 ; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rcx)
2960 ; AVX512-NEXT: vzeroupper
2961 ; AVX512-NEXT: retq
2963 ; AVX512-FCP-LABEL: store_i16_stride7_vf16:
2964 ; AVX512-FCP: # %bb.0:
2965 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm5
2966 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm7
2967 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm4
2968 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm6
2969 ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm1
2970 ; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm2
2971 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
2972 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[12,13,14,15],zero,zero,ymm5[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm5[u,u,u,u,u,u,u,u,16,17,18,19]
2973 ; AVX512-FCP-NEXT: vporq %ymm0, %ymm3, %ymm16
2974 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm11
2975 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm12
2976 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u],zero,zero,ymm6[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[16,17,u,u,u,u,u,u,u,u]
2977 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,14,15],zero,zero,ymm4[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm4[u,u,u,u,u,u,u,u]
2978 ; AVX512-FCP-NEXT: vporq %ymm0, %ymm3, %ymm17
2979 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm14
2980 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[14,15,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,17,u,u],zero,zero
2981 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u]
2982 ; AVX512-FCP-NEXT: vporq %ymm0, %ymm3, %ymm18
2983 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u]
2984 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[2,2,2,2,6,6,6,6]
2985 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15]
2986 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
2987 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,1,1,3,4,5,5,7]
2988 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7,8,9],ymm3[10],ymm9[11,12],ymm3[13],ymm9[14,15]
2989 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [2,0,3,2,0,10,10,11]
2990 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm9
2991 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6]
2992 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
2993 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15]
2994 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[1,1,1,1,5,5,5,5]
2995 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23]
2996 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm3[2],ymm10[3,4],ymm3[5],ymm10[6,7,8,9],ymm3[10],ymm10[11,12],ymm3[13],ymm10[14,15]
2997 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm19
2998 ; AVX512-FCP-NEXT: vprold $16, %ymm2, %ymm0
2999 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,2,2,3,5,6,6,7]
3000 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15]
3001 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,0,2,1,4,4,6,5]
3002 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21]
3003 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7,8,9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15]
3004 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm8
3005 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,2,3,3,10,9,11,10]
3006 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm13
3007 ; AVX512-FCP-NEXT: vprold $16, %xmm12, %xmm0
3008 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,2,3]
3009 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7]
3010 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3]
3011 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
3012 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,1,0,0,8,0,9]
3013 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm15
3014 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
3015 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
3016 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
3017 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
3018 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2,3],xmm14[4],xmm8[5,6],xmm14[7]
3019 ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm14
3020 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
3021 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
3022 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11
3023 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm11
3024 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm8
3025 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
3026 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm12
3027 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3028 ; AVX512-FCP-NEXT: vpbroadcastd 8(%rax), %ymm10
3029 ; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10
3030 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3]
3031 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
3032 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
3033 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,1,8,9,9,11]
3034 ; AVX512-FCP-NEXT: vpermi2q %zmm14, %zmm8, %zmm20
3035 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3]
3036 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[16,17,u,u]
3037 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm10
3038 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u]
3039 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,3,3,3,7,7,7,7]
3040 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15]
3041 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u]
3042 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,3,3,3,7,7,7,7]
3043 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13,14,15]
3044 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0]
3045 ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
3046 ; AVX512-FCP-NEXT: vpermd %zmm8, %zmm6, %zmm6
3047 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [6,0,0,0,7,0,0,7]
3048 ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm7
3049 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
3050 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3]
3051 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
3052 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
3053 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm12[0,0,1,1]
3054 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3]
3055 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
3056 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7]
3057 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31]
3058 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
3059 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
3060 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx
3061 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm3, %zmm2
3062 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0
3063 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
3064 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm8, %zmm2
3065 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm2 & mem)
3066 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm0))
3067 ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm19[2,2,2,3,6,6,6,7]
3068 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = zmm9 ^ (mem & (zmm0 ^ zmm9))
3069 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm13))
3070 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm0))
3071 ; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm11[0,0,1,1,4,4,5,5]
3072 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = zmm15 ^ (mem & (zmm0 ^ zmm15))
3073 ; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm2
3074 ; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm3
3075 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
3076 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm20))
3077 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0))
3078 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (mem & (ymm4 ^ ymm5))
3079 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (mem & (ymm1 ^ ymm7))
3080 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm4))
3081 ; AVX512-FCP-NEXT: vmovdqa %ymm1, 192(%rcx)
3082 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rcx)
3083 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rcx)
3084 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 64(%rcx)
3085 ; AVX512-FCP-NEXT: vzeroupper
3086 ; AVX512-FCP-NEXT: retq
3088 ; AVX512DQ-LABEL: store_i16_stride7_vf16:
3089 ; AVX512DQ: # %bb.0:
3090 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
3091 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm8
3092 ; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm9
3093 ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm6
3094 ; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm7
3095 ; AVX512DQ-NEXT: vmovdqa (%r8), %ymm2
3096 ; AVX512DQ-NEXT: vmovdqa (%r9), %ymm3
3097 ; AVX512DQ-NEXT: vmovdqa (%rax), %ymm13
3098 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u],zero,zero,ymm7[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm7[16,17,u,u,u,u,u,u,u,u]
3099 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[u,u,u,u,u,u,14,15],zero,zero,ymm6[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm6[u,u,u,u,u,u,u,u]
3100 ; AVX512DQ-NEXT: vporq %ymm1, %ymm4, %ymm16
3101 ; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm14
3102 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm15
3103 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
3104 ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm21
3105 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm9[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
3106 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[12,13,14,15],zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm8[u,u,u,u,u,u,u,u,16,17,18,19]
3107 ; AVX512DQ-NEXT: vporq %ymm1, %ymm4, %ymm17
3108 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
3109 ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm4
3110 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[14,15,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[16,17,u,u],zero,zero
3111 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm2[u,u,u,u]
3112 ; AVX512DQ-NEXT: vporq %ymm5, %ymm10, %ymm19
3113 ; AVX512DQ-NEXT: vpbroadcastd 8(%rax), %ymm5
3114 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5
3115 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm13[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[16,17,u,u]
3116 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5
3117 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,1,1,3,4,5,5,7]
3118 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
3119 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4]
3120 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
3121 ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm22
3122 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[2,2,2,2,6,6,6,6]
3123 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
3124 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7,8,9],ymm10[10],ymm12[11,12],ymm10[13],ymm12[14,15]
3125 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm8[1,1,1,1,5,5,5,5]
3126 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
3127 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5]
3128 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7,8,9],ymm10[10],ymm11[11,12],ymm10[13],ymm11[14,15]
3129 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
3130 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm20
3131 ; AVX512DQ-NEXT: vprold $16, %xmm4, %xmm11
3132 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3]
3133 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7]
3134 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
3135 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,2,4,5,6,7]
3136 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,16,0,0,17,17,0,0,0,0,0,1,0,0,2,0]
3137 ; AVX512DQ-NEXT: vpermi2d %zmm12, %zmm11, %zmm18
3138 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6]
3139 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
3140 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6]
3141 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13,14,15]
3142 ; AVX512DQ-NEXT: vmovdqa (%r9), %xmm11
3143 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
3144 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[1,1,2,2]
3145 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3],xmm14[4],xmm15[5,6],xmm14[7]
3146 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm15
3147 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
3148 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm15[4],xmm11[4],xmm15[5],xmm11[5],xmm15[6],xmm11[6],xmm15[7],xmm11[7]
3149 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
3150 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm4
3151 ; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm0
3152 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm1
3153 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,0,2,1,4,4,6,5]
3154 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
3155 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,0,0,0,4,4,4,4]
3156 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5],ymm0[6],ymm14[7,8,9,10],ymm0[11],ymm14[12,13],ymm0[14],ymm14[15]
3157 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
3158 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,5,7,6]
3159 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
3160 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm15 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
3161 ; AVX512DQ-NEXT: vpermi2d %zmm11, %zmm0, %zmm15
3162 ; AVX512DQ-NEXT: vprold $16, %ymm3, %ymm0
3163 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[1,2,2,3,5,6,6,7]
3164 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7,8,9],ymm0[10],ymm11[11,12],ymm0[13],ymm11[14,15]
3165 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[3,3,3,3,7,7,7,7]
3166 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
3167 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
3168 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7,8,9,10],ymm9[11],ymm8[12,13],ymm9[14],ymm8[15]
3169 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,3,3,3,7,7,7,7]
3170 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
3171 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6]
3172 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14,15]
3173 ; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm7
3174 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7]
3175 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,2,1]
3176 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm10[2,1,2,3,4,5,6,7]
3177 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4]
3178 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,3]
3179 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
3180 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm22[2,1,3,2]
3181 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm12[0,2,2,3]
3182 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,2,3,3]
3183 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
3184 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3]
3185 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
3186 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
3187 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
3188 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7]
3189 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
3190 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
3191 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
3192 ; AVX512DQ-NEXT: vpermd %zmm13, %zmm3, %zmm3
3193 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,3,3,6,7,7,7]
3194 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm7, %zmm7
3195 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rcx
3196 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm9, %zmm9
3197 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm9 = zmm7 ^ (mem & (zmm9 ^ zmm7))
3198 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm19, %zmm4, %zmm4
3199 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm4 & mem)
3200 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm9))
3201 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm4
3202 ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm7 = zmm20[2,2,2,3,6,6,6,7]
3203 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm7 = zmm4 ^ (mem & (zmm7 ^ zmm4))
3204 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm12, %zmm0
3205 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0))
3206 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm7))
3207 ; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm1[0,0,1,1,4,4,5,5]
3208 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm0 = zmm18 ^ (mem & (zmm0 ^ zmm18))
3209 ; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm1
3210 ; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm4
3211 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
3212 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm15))
3213 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm0))
3214 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm8))
3215 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm13[2,1,3,2]
3216 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
3217 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm6))
3218 ; AVX512DQ-NEXT: vmovdqa %ymm0, 192(%rcx)
3219 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rcx)
3220 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rcx)
3221 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rcx)
3222 ; AVX512DQ-NEXT: vzeroupper
3223 ; AVX512DQ-NEXT: retq
3225 ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf16:
3226 ; AVX512DQ-FCP: # %bb.0:
3227 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm5
3228 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm7
3229 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm4
3230 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm6
3231 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm1
3232 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm2
3233 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
3234 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[12,13,14,15],zero,zero,ymm5[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm5[u,u,u,u,u,u,u,u,16,17,18,19]
3235 ; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm3, %ymm16
3236 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm11
3237 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm12
3238 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u],zero,zero,ymm6[14,15,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[16,17,u,u,u,u,u,u,u,u]
3239 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,14,15],zero,zero,ymm4[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm4[u,u,u,u,u,u,u,u]
3240 ; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm3, %ymm17
3241 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm14
3242 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[14,15,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,17,u,u],zero,zero
3243 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,12,13,14,15],zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,16,17],zero,zero,ymm1[u,u,u,u]
3244 ; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm3, %ymm18
3245 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,u,u,u,u,26,27,u,u,u,u,u,u]
3246 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[2,2,2,2,6,6,6,6]
3247 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15]
3248 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
3249 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,1,1,3,4,5,5,7]
3250 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7,8,9],ymm3[10],ymm9[11,12],ymm3[13],ymm9[14,15]
3251 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [2,0,3,2,0,10,10,11]
3252 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm9
3253 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6]
3254 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
3255 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15]
3256 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[1,1,1,1,5,5,5,5]
3257 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,u,u,20,21,24,25,u,u,22,23,22,23]
3258 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm3[2],ymm10[3,4],ymm3[5],ymm10[6,7,8,9],ymm3[10],ymm10[11,12],ymm3[13],ymm10[14,15]
3259 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm19
3260 ; AVX512DQ-FCP-NEXT: vprold $16, %ymm2, %ymm0
3261 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,2,2,3,5,6,6,7]
3262 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7,8,9],ymm0[10],ymm3[11,12],ymm0[13],ymm3[14,15]
3263 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,0,2,1,4,4,6,5]
3264 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21]
3265 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7,8,9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15]
3266 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm8
3267 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,2,3,3,10,9,11,10]
3268 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm13
3269 ; AVX512DQ-FCP-NEXT: vprold $16, %xmm12, %xmm0
3270 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,2,3]
3271 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4],xmm0[5],xmm3[6,7]
3272 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3]
3273 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
3274 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,1,0,0,8,0,9]
3275 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm15
3276 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
3277 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
3278 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
3279 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
3280 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2,3],xmm14[4],xmm8[5,6],xmm14[7]
3281 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm14
3282 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
3283 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
3284 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11
3285 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm11, %zmm11
3286 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm8
3287 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
3288 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm12
3289 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3290 ; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rax), %ymm10
3291 ; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10
3292 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3]
3293 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
3294 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
3295 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,1,8,9,9,11]
3296 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm14, %zmm8, %zmm20
3297 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3]
3298 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[16,17,u,u]
3299 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm10, %zmm10
3300 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,14,15,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u]
3301 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[3,3,3,3,7,7,7,7]
3302 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15]
3303 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,u,u,u,u,30,31,u,u,u,u,u,u]
3304 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,3,3,3,7,7,7,7]
3305 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13,14,15]
3306 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0]
3307 ; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
3308 ; AVX512DQ-FCP-NEXT: vpermd %zmm8, %zmm6, %zmm6
3309 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [6,0,0,0,7,0,0,7]
3310 ; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm7
3311 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
3312 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3]
3313 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
3314 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
3315 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm12[0,0,1,1]
3316 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3]
3317 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
3318 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7]
3319 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31]
3320 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
3321 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
3322 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rcx
3323 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm3, %zmm2
3324 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm0
3325 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
3326 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm8, %zmm2
3327 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm2 & mem)
3328 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm0))
3329 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm19[2,2,2,3,6,6,6,7]
3330 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = zmm9 ^ (mem & (zmm0 ^ zmm9))
3331 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm13))
3332 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm0))
3333 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm11[0,0,1,1,4,4,5,5]
3334 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = zmm15 ^ (mem & (zmm0 ^ zmm15))
3335 ; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm2
3336 ; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm3
3337 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
3338 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm20))
3339 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0))
3340 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (mem & (ymm4 ^ ymm5))
3341 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm7 ^ (mem & (ymm1 ^ ymm7))
3342 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm4))
3343 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, 192(%rcx)
3344 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rcx)
3345 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rcx)
3346 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 64(%rcx)
3347 ; AVX512DQ-FCP-NEXT: vzeroupper
3348 ; AVX512DQ-FCP-NEXT: retq
3350 ; AVX512BW-LABEL: store_i16_stride7_vf16:
3351 ; AVX512BW: # %bb.0:
3352 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
3353 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
3354 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
3355 ; AVX512BW-NEXT: vmovdqa (%rdx), %ymm1
3356 ; AVX512BW-NEXT: vmovdqa (%r8), %ymm2
3357 ; AVX512BW-NEXT: vmovdqa (%r10), %ymm3
3358 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
3359 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
3360 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2
3361 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,14,30,46,62,0,0,0,15,31,47,63,0,0,0]
3362 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
3363 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [29,45,0,0,0,0,14,30,46,0,0,0,0,15,31,47]
3364 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
3365 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,16,32,0,0,0,0,1,17,33,0,0,0,0,2,18,34,0,0,0,0,3,19,35,0,0,0,0]
3366 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3367 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,16,32,48,0,0,0,1,17,33,49,0,0,0,2,18,34,50,0,0,0,3,19,35,51,0,0,0,4,20,36,52]
3368 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
3369 ; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870
3370 ; AVX512BW-NEXT: kmovd %ecx, %k1
3371 ; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1}
3372 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,0,0,0,0,5,21,37,0,0,0,0,6,22,38,0,0,0,0,7,23,39,0,0,0,0,8,24,40,0]
3373 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3374 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,0,5,21,37,53,0,0,0,6,22,38,54,0,0,0,7,23,39,55,0,0,0,8,24,40,56,0,0,0,9]
3375 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
3376 ; AVX512BW-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387
3377 ; AVX512BW-NEXT: kmovd %ecx, %k1
3378 ; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm8 {%k1}
3379 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,9,25,41,0,0,0,0,10,26,42,0,0,0,0,11,27,43,0,0,0,0,12,28,44,0,0,0,0,13]
3380 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3381 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [57,9,25,0,0,0,42,58,10,26,0,0,0,43,59,11,27,0,0,0,44,60,12,28,0,0,0,45,61,13,29,0]
3382 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
3383 ; AVX512BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38
3384 ; AVX512BW-NEXT: kmovd %ecx, %k1
3385 ; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm2 {%k1}
3386 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rax)
3387 ; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rax)
3388 ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax)
3389 ; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3
3390 ; AVX512BW-NEXT: kmovd %ecx, %k1
3391 ; AVX512BW-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1}
3392 ; AVX512BW-NEXT: vmovdqa %ymm4, 192(%rax)
3393 ; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
3396 ; AVX512BW-FCP-LABEL: store_i16_stride7_vf16:
3397 ; AVX512BW-FCP: # %bb.0:
3398 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3399 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
3400 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
3401 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm1
3402 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm2
3403 ; AVX512BW-FCP-NEXT: vmovdqa (%r10), %ymm3
3404 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
3405 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
3406 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2
3407 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,14,30,46,62,0,0,0,15,31,47,63,0,0,0]
3408 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
3409 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [29,45,0,0,0,0,14,30,46,0,0,0,0,15,31,47]
3410 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
3411 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,16,32,0,0,0,0,1,17,33,0,0,0,0,2,18,34,0,0,0,0,3,19,35,0,0,0,0]
3412 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3413 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,16,32,48,0,0,0,1,17,33,49,0,0,0,2,18,34,50,0,0,0,3,19,35,51,0,0,0,4,20,36,52]
3414 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
3415 ; AVX512BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870
3416 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
3417 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1}
3418 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,0,0,0,0,5,21,37,0,0,0,0,6,22,38,0,0,0,0,7,23,39,0,0,0,0,8,24,40,0]
3419 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3420 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,0,5,21,37,53,0,0,0,6,22,38,54,0,0,0,7,23,39,55,0,0,0,8,24,40,56,0,0,0,9]
3421 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
3422 ; AVX512BW-FCP-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387
3423 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
3424 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm8 {%k1}
3425 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,9,25,41,0,0,0,0,10,26,42,0,0,0,0,11,27,43,0,0,0,0,12,28,44,0,0,0,0,13]
3426 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3427 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [57,9,25,0,0,0,42,58,10,26,0,0,0,43,59,11,27,0,0,0,44,60,12,28,0,0,0,45,61,13,29,0]
3428 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
3429 ; AVX512BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38
3430 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
3431 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm2 {%k1}
3432 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax)
3433 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax)
3434 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
3435 ; AVX512BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3
3436 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
3437 ; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1}
3438 ; AVX512BW-FCP-NEXT: vmovdqa %ymm4, 192(%rax)
3439 ; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
3442 ; AVX512DQ-BW-LABEL: store_i16_stride7_vf16:
3443 ; AVX512DQ-BW: # %bb.0:
3444 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
3445 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
3446 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0
3447 ; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm1
3448 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm2
3449 ; AVX512DQ-BW-NEXT: vmovdqa (%r10), %ymm3
3450 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
3451 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
3452 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2
3453 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,14,30,46,62,0,0,0,15,31,47,63,0,0,0]
3454 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
3455 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [29,45,0,0,0,0,14,30,46,0,0,0,0,15,31,47]
3456 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
3457 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,16,32,0,0,0,0,1,17,33,0,0,0,0,2,18,34,0,0,0,0,3,19,35,0,0,0,0]
3458 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3459 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,16,32,48,0,0,0,1,17,33,49,0,0,0,2,18,34,50,0,0,0,3,19,35,51,0,0,0,4,20,36,52]
3460 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
3461 ; AVX512DQ-BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870
3462 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
3463 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1}
3464 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,0,0,0,0,5,21,37,0,0,0,0,6,22,38,0,0,0,0,7,23,39,0,0,0,0,8,24,40,0]
3465 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3466 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,0,5,21,37,53,0,0,0,6,22,38,54,0,0,0,7,23,39,55,0,0,0,8,24,40,56,0,0,0,9]
3467 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
3468 ; AVX512DQ-BW-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387
3469 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
3470 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm8 {%k1}
3471 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,9,25,41,0,0,0,0,10,26,42,0,0,0,0,11,27,43,0,0,0,0,12,28,44,0,0,0,0,13]
3472 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3473 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [57,9,25,0,0,0,42,58,10,26,0,0,0,43,59,11,27,0,0,0,44,60,12,28,0,0,0,45,61,13,29,0]
3474 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
3475 ; AVX512DQ-BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38
3476 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
3477 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm2 {%k1}
3478 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 128(%rax)
3479 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 64(%rax)
3480 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rax)
3481 ; AVX512DQ-BW-NEXT: movw $-7741, %cx # imm = 0xE1C3
3482 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
3483 ; AVX512DQ-BW-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1}
3484 ; AVX512DQ-BW-NEXT: vmovdqa %ymm4, 192(%rax)
3485 ; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
3488 ; AVX512DQ-BW-FCP-LABEL: store_i16_stride7_vf16:
3489 ; AVX512DQ-BW-FCP: # %bb.0:
3490 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3491 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
3492 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
3493 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm1
3494 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm2
3495 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r10), %ymm3
3496 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
3497 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
3498 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2
3499 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,14,30,46,62,0,0,0,15,31,47,63,0,0,0]
3500 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
3501 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [29,45,0,0,0,0,14,30,46,0,0,0,0,15,31,47]
3502 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
3503 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,0,0,16,32,0,0,0,0,1,17,33,0,0,0,0,2,18,34,0,0,0,0,3,19,35,0,0,0,0]
3504 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3505 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,16,32,48,0,0,0,1,17,33,49,0,0,0,2,18,34,50,0,0,0,3,19,35,51,0,0,0,4,20,36,52]
3506 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
3507 ; AVX512DQ-BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870
3508 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
3509 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1}
3510 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [4,20,36,0,0,0,0,5,21,37,0,0,0,0,6,22,38,0,0,0,0,7,23,39,0,0,0,0,8,24,40,0]
3511 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3512 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,0,5,21,37,53,0,0,0,6,22,38,54,0,0,0,7,23,39,55,0,0,0,8,24,40,56,0,0,0,9]
3513 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
3514 ; AVX512DQ-BW-FCP-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387
3515 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
3516 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm8 {%k1}
3517 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,0,9,25,41,0,0,0,0,10,26,42,0,0,0,0,11,27,43,0,0,0,0,12,28,44,0,0,0,0,13]
3518 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3519 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [57,9,25,0,0,0,42,58,10,26,0,0,0,43,59,11,27,0,0,0,44,60,12,28,0,0,0,45,61,13,29,0]
3520 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
3521 ; AVX512DQ-BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38
3522 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
3523 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm2 {%k1}
3524 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax)
3525 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax)
3526 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
3527 ; AVX512DQ-BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3
3528 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
3529 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1}
3530 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, 192(%rax)
3531 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
3532 ; AVX512DQ-BW-FCP-NEXT: retq
3533 %in.vec0 = load <16 x i16>, ptr %in.vecptr0, align 64
3534 %in.vec1 = load <16 x i16>, ptr %in.vecptr1, align 64
3535 %in.vec2 = load <16 x i16>, ptr %in.vecptr2, align 64
3536 %in.vec3 = load <16 x i16>, ptr %in.vecptr3, align 64
3537 %in.vec4 = load <16 x i16>, ptr %in.vecptr4, align 64
3538 %in.vec5 = load <16 x i16>, ptr %in.vecptr5, align 64
3539 %in.vec6 = load <16 x i16>, ptr %in.vecptr6, align 64
3540 %1 = shufflevector <16 x i16> %in.vec0, <16 x i16> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
3541 %2 = shufflevector <16 x i16> %in.vec2, <16 x i16> %in.vec3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
3542 %3 = shufflevector <16 x i16> %in.vec4, <16 x i16> %in.vec5, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
3543 %4 = shufflevector <32 x i16> %1, <32 x i16> %2, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
3544 %5 = shufflevector <16 x i16> %in.vec6, <16 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3545 %6 = shufflevector <32 x i16> %3, <32 x i16> %5, <48 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
3546 %7 = shufflevector <48 x i16> %6, <48 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
3547 %8 = shufflevector <64 x i16> %4, <64 x i16> %7, <112 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111>
3548 %interleaved.vec = shufflevector <112 x i16> %8, <112 x i16> poison, <112 x i32> <i32 0, i32 16, i32 32, i32 48, i32 64, i32 80, i32 96, i32 1, i32 17, i32 33, i32 49, i32 65, i32 81, i32 97, i32 2, i32 18, i32 34, i32 50, i32 66, i32 82, i32 98, i32 3, i32 19, i32 35, i32 51, i32 67, i32 83, i32 99, i32 4, i32 20, i32 36, i32 52, i32 68, i32 84, i32 100, i32 5, i32 21, i32 37, i32 53, i32 69, i32 85, i32 101, i32 6, i32 22, i32 38, i32 54, i32 70, i32 86, i32 102, i32 7, i32 23, i32 39, i32 55, i32 71, i32 87, i32 103, i32 8, i32 24, i32 40, i32 56, i32 72, i32 88, i32 104, i32 9, i32 25, i32 41, i32 57, i32 73, i32 89, i32 105, i32 10, i32 26, i32 42, i32 58, i32 74, i32 90, i32 106, i32 11, i32 27, i32 43, i32 59, i32 75, i32 91, i32 107, i32 12, i32 28, i32 44, i32 60, i32 76, i32 92, i32 108, i32 13, i32 29, i32 45, i32 61, i32 77, i32 93, i32 109, i32 14, i32 30, i32 46, i32 62, i32 78, i32 94, i32 110, i32 15, i32 31, i32 47, i32 63, i32 79, i32 95, i32 111>
store <112 x i16> %interleaved.vec, ptr %out.vec, align 64
ret void
}

define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind {
; SSE-LABEL: store_i16_stride7_vf32:
; SSE: # %bb.0:
; SSE-NEXT: subq $680, %rsp # imm = 0x2A8
3557 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
3558 ; SSE-NEXT: movdqa 48(%rdi), %xmm3
3559 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3560 ; SSE-NEXT: movdqa 48(%rsi), %xmm2
3561 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3562 ; SSE-NEXT: movdqa 48(%rdx), %xmm10
3563 ; SSE-NEXT: movdqa 48(%rcx), %xmm11
3564 ; SSE-NEXT: movdqa 48(%r8), %xmm9
3565 ; SSE-NEXT: movdqa 48(%r9), %xmm4
3566 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3567 ; SSE-NEXT: movaps 48(%rax), %xmm7
3568 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2]
3569 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3570 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,0,65535]
3571 ; SSE-NEXT: movdqa %xmm6, %xmm1
3572 ; SSE-NEXT: pandn %xmm0, %xmm1
3573 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7]
3574 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3575 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
3576 ; SSE-NEXT: pand %xmm6, %xmm0
3577 ; SSE-NEXT: por %xmm1, %xmm0
3578 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
3579 ; SSE-NEXT: movdqa %xmm3, %xmm1
3580 ; SSE-NEXT: movdqa %xmm3, %xmm5
3581 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3582 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
3583 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535]
3584 ; SSE-NEXT: pand %xmm3, %xmm0
3585 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1]
3586 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3587 ; SSE-NEXT: pandn %xmm2, %xmm3
3588 ; SSE-NEXT: por %xmm0, %xmm3
3589 ; SSE-NEXT: movdqa %xmm4, %xmm0
3590 ; SSE-NEXT: psrld $16, %xmm0
3591 ; SSE-NEXT: movdqa %xmm0, %xmm2
3592 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0]
3593 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[0,2]
3594 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
3595 ; SSE-NEXT: andps %xmm1, %xmm2
3596 ; SSE-NEXT: andnps %xmm7, %xmm1
3597 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3598 ; SSE-NEXT: orps %xmm2, %xmm1
3599 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3600 ; SSE-NEXT: movdqa %xmm11, %xmm1
3601 ; SSE-NEXT: psrlq $48, %xmm1
3602 ; SSE-NEXT: movdqa %xmm10, %xmm2
3603 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
3604 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,65535,65535,65535]
3605 ; SSE-NEXT: pandn %xmm2, %xmm1
3606 ; SSE-NEXT: movdqa %xmm5, %xmm2
3607 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3608 ; SSE-NEXT: por %xmm1, %xmm2
3609 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,2,3,3]
3610 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535]
3611 ; SSE-NEXT: movdqa %xmm1, %xmm4
3612 ; SSE-NEXT: movdqa %xmm1, %xmm8
3613 ; SSE-NEXT: pandn %xmm3, %xmm4
3614 ; SSE-NEXT: por %xmm2, %xmm4
3615 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm4[2,0]
3616 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,0]
3617 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0]
3618 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3]
3619 ; SSE-NEXT: movdqa %xmm1, %xmm3
3620 ; SSE-NEXT: pandn %xmm0, %xmm3
3621 ; SSE-NEXT: andps %xmm1, %xmm2
3622 ; SSE-NEXT: por %xmm2, %xmm3
3623 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3624 ; SSE-NEXT: movdqa (%rax), %xmm7
3625 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,1]
3626 ; SSE-NEXT: movdqa %xmm8, %xmm2
3627 ; SSE-NEXT: pandn %xmm0, %xmm2
3628 ; SSE-NEXT: movdqa (%r8), %xmm0
3629 ; SSE-NEXT: movdqa (%r9), %xmm13
3630 ; SSE-NEXT: movdqa %xmm0, %xmm3
3631 ; SSE-NEXT: movdqa %xmm0, %xmm12
3632 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3633 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
3634 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3635 ; SSE-NEXT: movdqa %xmm3, %xmm0
3636 ; SSE-NEXT: movdqa %xmm3, %xmm5
3637 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3638 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
3639 ; SSE-NEXT: pand %xmm8, %xmm0
3640 ; SSE-NEXT: por %xmm2, %xmm0
3641 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535]
3642 ; SSE-NEXT: movdqa %xmm1, %xmm2
3643 ; SSE-NEXT: movdqa %xmm1, %xmm10
3644 ; SSE-NEXT: pandn %xmm0, %xmm2
3645 ; SSE-NEXT: movdqa (%rcx), %xmm1
3646 ; SSE-NEXT: movdqa %xmm1, %xmm0
3647 ; SSE-NEXT: psrld $16, %xmm0
3648 ; SSE-NEXT: movdqa (%rdx), %xmm4
3649 ; SSE-NEXT: movdqa %xmm4, %xmm3
3650 ; SSE-NEXT: movdqa %xmm4, %xmm14
3651 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
3652 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,0,0,65535,65535,65535,65535,65535]
3653 ; SSE-NEXT: movdqa %xmm15, %xmm4
3654 ; SSE-NEXT: pandn %xmm3, %xmm4
3655 ; SSE-NEXT: movdqa (%rdi), %xmm11
3656 ; SSE-NEXT: movdqa (%rsi), %xmm9
3657 ; SSE-NEXT: movdqa %xmm9, %xmm3
3658 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3659 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3]
3660 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3661 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
3662 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4]
3663 ; SSE-NEXT: pand %xmm15, %xmm3
3664 ; SSE-NEXT: por %xmm4, %xmm3
3665 ; SSE-NEXT: pand %xmm10, %xmm3
3666 ; SSE-NEXT: por %xmm2, %xmm3
3667 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3668 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
3669 ; SSE-NEXT: pandn %xmm7, %xmm2
3670 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3671 ; SSE-NEXT: movdqa %xmm5, %xmm3
3672 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3673 ; SSE-NEXT: por %xmm2, %xmm3
3674 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,0,65535,65535,65535,65535]
3675 ; SSE-NEXT: movdqa %xmm0, %xmm2
3676 ; SSE-NEXT: pandn %xmm3, %xmm2
3677 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3678 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[2,2,2,2]
3679 ; SSE-NEXT: movdqa %xmm6, %xmm4
3680 ; SSE-NEXT: pandn %xmm3, %xmm4
3681 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,4,4]
3682 ; SSE-NEXT: pand %xmm6, %xmm3
3683 ; SSE-NEXT: por %xmm4, %xmm3
3684 ; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
3685 ; SSE-NEXT: movdqa %xmm11, %xmm4
3686 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3]
3687 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,3,3,3,4,5,6,7]
3688 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3689 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2]
3690 ; SSE-NEXT: andps %xmm0, %xmm3
3691 ; SSE-NEXT: orps %xmm2, %xmm3
3692 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3693 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3]
3694 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535]
3695 ; SSE-NEXT: movdqa %xmm0, %xmm3
3696 ; SSE-NEXT: pandn %xmm2, %xmm3
3697 ; SSE-NEXT: movdqa %xmm13, %xmm2
3698 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
3699 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
3700 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4]
3701 ; SSE-NEXT: pand %xmm0, %xmm2
3702 ; SSE-NEXT: por %xmm3, %xmm2
3703 ; SSE-NEXT: movdqa %xmm14, %xmm4
3704 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
3705 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3706 ; SSE-NEXT: movdqa %xmm11, %xmm3
3707 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm4[2,3]
3708 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm2[0,3]
3709 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3710 ; SSE-NEXT: movdqa 16(%rax), %xmm7
3711 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,0,1]
3712 ; SSE-NEXT: movdqa %xmm8, %xmm5
3713 ; SSE-NEXT: movdqa %xmm8, %xmm3
3714 ; SSE-NEXT: pandn %xmm2, %xmm3
3715 ; SSE-NEXT: movdqa 16(%r8), %xmm10
3716 ; SSE-NEXT: movdqa 16(%r9), %xmm8
3717 ; SSE-NEXT: movdqa %xmm10, %xmm4
3718 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3719 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
3720 ; SSE-NEXT: movdqa %xmm8, (%rsp) # 16-byte Spill
3721 ; SSE-NEXT: movdqa %xmm4, %xmm2
3722 ; SSE-NEXT: movdqa %xmm4, %xmm12
3723 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3724 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
3725 ; SSE-NEXT: pand %xmm5, %xmm2
3726 ; SSE-NEXT: por %xmm3, %xmm2
3727 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535]
3728 ; SSE-NEXT: movdqa %xmm1, %xmm3
3729 ; SSE-NEXT: pandn %xmm2, %xmm3
3730 ; SSE-NEXT: movdqa 16(%rcx), %xmm5
3731 ; SSE-NEXT: movdqa %xmm5, %xmm2
3732 ; SSE-NEXT: psrld $16, %xmm2
3733 ; SSE-NEXT: movdqa 16(%rdx), %xmm0
3734 ; SSE-NEXT: movdqa %xmm0, %xmm4
3735 ; SSE-NEXT: movdqa %xmm0, %xmm9
3736 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
3737 ; SSE-NEXT: movdqa %xmm15, %xmm2
3738 ; SSE-NEXT: pandn %xmm4, %xmm2
3739 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
3740 ; SSE-NEXT: movdqa 16(%rsi), %xmm13
3741 ; SSE-NEXT: movdqa %xmm13, %xmm4
3742 ; SSE-NEXT: movdqa %xmm13, %xmm14
3743 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3744 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
3745 ; SSE-NEXT: movdqa %xmm0, %xmm13
3746 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3747 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7]
3748 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4]
3749 ; SSE-NEXT: pand %xmm15, %xmm4
3750 ; SSE-NEXT: por %xmm2, %xmm4
3751 ; SSE-NEXT: pand %xmm1, %xmm4
3752 ; SSE-NEXT: por %xmm3, %xmm4
3753 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3754 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
3755 ; SSE-NEXT: pandn %xmm7, %xmm2
3756 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3757 ; SSE-NEXT: movdqa %xmm12, %xmm3
3758 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3759 ; SSE-NEXT: por %xmm2, %xmm3
3760 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,0,65535,65535,65535,65535]
3761 ; SSE-NEXT: movdqa %xmm0, %xmm2
3762 ; SSE-NEXT: pandn %xmm3, %xmm2
3763 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3764 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[2,2,2,2]
3765 ; SSE-NEXT: movdqa %xmm6, %xmm4
3766 ; SSE-NEXT: pandn %xmm3, %xmm4
3767 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,4,4]
3768 ; SSE-NEXT: pand %xmm6, %xmm3
3769 ; SSE-NEXT: por %xmm4, %xmm3
3770 ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7]
3771 ; SSE-NEXT: movdqa %xmm13, %xmm4
3772 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3]
3773 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[3,3,3,3,4,5,6,7]
3774 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3775 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2]
3776 ; SSE-NEXT: andps %xmm0, %xmm3
3777 ; SSE-NEXT: orps %xmm2, %xmm3
3778 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3779 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3]
3780 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535]
3781 ; SSE-NEXT: movdqa %xmm0, %xmm3
3782 ; SSE-NEXT: pandn %xmm2, %xmm3
3783 ; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
3784 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,2,2,2,4,5,6,7]
3785 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4]
3786 ; SSE-NEXT: pand %xmm0, %xmm2
3787 ; SSE-NEXT: por %xmm3, %xmm2
3788 ; SSE-NEXT: movdqa %xmm9, %xmm4
3789 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
3790 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3791 ; SSE-NEXT: movdqa %xmm13, %xmm3
3792 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,2],xmm4[2,3]
3793 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,2],xmm2[0,3]
3794 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3795 ; SSE-NEXT: movdqa 32(%rax), %xmm7
3796 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,1,0,1]
3797 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535]
3798 ; SSE-NEXT: movdqa %xmm5, %xmm3
3799 ; SSE-NEXT: pandn %xmm2, %xmm3
3800 ; SSE-NEXT: movdqa 32(%r8), %xmm10
3801 ; SSE-NEXT: movdqa 32(%r9), %xmm9
3802 ; SSE-NEXT: movdqa %xmm10, %xmm4
3803 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3804 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
3805 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3806 ; SSE-NEXT: movdqa %xmm4, %xmm2
3807 ; SSE-NEXT: movdqa %xmm4, %xmm14
3808 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3809 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
3810 ; SSE-NEXT: pand %xmm5, %xmm2
3811 ; SSE-NEXT: por %xmm3, %xmm2
3812 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,0,0,65535,65535]
3813 ; SSE-NEXT: movdqa %xmm5, %xmm3
3814 ; SSE-NEXT: pandn %xmm2, %xmm3
3815 ; SSE-NEXT: movdqa 32(%rcx), %xmm12
3816 ; SSE-NEXT: movdqa %xmm12, %xmm2
3817 ; SSE-NEXT: psrld $16, %xmm2
3818 ; SSE-NEXT: movdqa 32(%rdx), %xmm0
3819 ; SSE-NEXT: movdqa %xmm0, %xmm4
3820 ; SSE-NEXT: movdqa %xmm0, %xmm8
3821 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
3822 ; SSE-NEXT: movdqa %xmm15, %xmm2
3823 ; SSE-NEXT: pandn %xmm4, %xmm2
3824 ; SSE-NEXT: movdqa 32(%rdi), %xmm1
3825 ; SSE-NEXT: movdqa 32(%rsi), %xmm0
3826 ; SSE-NEXT: movdqa %xmm0, %xmm4
3827 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3828 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
3829 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3830 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7]
3831 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4]
3832 ; SSE-NEXT: pand %xmm15, %xmm4
3833 ; SSE-NEXT: por %xmm2, %xmm4
3834 ; SSE-NEXT: pand %xmm5, %xmm4
3835 ; SSE-NEXT: por %xmm3, %xmm4
3836 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3837 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,0,65535,65535,65535,65535]
3838 ; SSE-NEXT: pandn %xmm7, %xmm3
3839 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3840 ; SSE-NEXT: movdqa %xmm14, %xmm2
3841 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3842 ; SSE-NEXT: por %xmm3, %xmm2
3843 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,4,4,4]
3844 ; SSE-NEXT: pand %xmm6, %xmm3
3845 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3846 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,2,2,2]
3847 ; SSE-NEXT: pandn %xmm4, %xmm6
3848 ; SSE-NEXT: por %xmm3, %xmm6
3849 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
3850 ; SSE-NEXT: movdqa %xmm1, %xmm3
3851 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3852 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm6[3,3]
3853 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[3,3,3,3,4,5,6,7]
3854 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2]
3855 ; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,0,0,0,65535,65535,65535,65535]
3856 ; SSE-NEXT: andps %xmm5, %xmm4
3857 ; SSE-NEXT: andnps %xmm2, %xmm5
3858 ; SSE-NEXT: orps %xmm4, %xmm5
3859 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3860 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,2,3]
3861 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535]
3862 ; SSE-NEXT: movdqa %xmm0, %xmm3
3863 ; SSE-NEXT: pandn %xmm2, %xmm3
3864 ; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
3865 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[2,2,2,2,4,5,6,7]
3866 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4]
3867 ; SSE-NEXT: pand %xmm0, %xmm2
3868 ; SSE-NEXT: movdqa %xmm0, %xmm9
3869 ; SSE-NEXT: por %xmm3, %xmm2
3870 ; SSE-NEXT: movdqa %xmm8, %xmm0
3871 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
3872 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3873 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm0[2,3]
3874 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm2[0,3]
3875 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3876 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3877 ; SSE-NEXT: movdqa %xmm1, %xmm2
3878 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3879 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
3880 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3881 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
3882 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,65535,65535]
3883 ; SSE-NEXT: pand %xmm10, %xmm2
3884 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3885 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,1,0,1]
3886 ; SSE-NEXT: pandn %xmm3, %xmm10
3887 ; SSE-NEXT: por %xmm2, %xmm10
3888 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3889 ; SSE-NEXT: movdqa %xmm5, %xmm6
3890 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3891 ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
3892 ; SSE-NEXT: movdqa %xmm5, %xmm2
3893 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
3894 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3895 ; SSE-NEXT: psrld $16, %xmm3
3896 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
3897 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3898 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3899 ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
3900 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
3901 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4]
3902 ; SSE-NEXT: pand %xmm15, %xmm3
3903 ; SSE-NEXT: pandn %xmm2, %xmm15
3904 ; SSE-NEXT: por %xmm3, %xmm15
3905 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535]
3906 ; SSE-NEXT: pand %xmm2, %xmm15
3907 ; SSE-NEXT: pandn %xmm10, %xmm2
3908 ; SSE-NEXT: por %xmm15, %xmm2
3909 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3910 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2]
3911 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm6[2,0]
3912 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3913 ; SSE-NEXT: pslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9]
3914 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,65535]
3915 ; SSE-NEXT: movdqa %xmm4, %xmm2
3916 ; SSE-NEXT: pandn %xmm6, %xmm2
3917 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3918 ; SSE-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
3919 ; SSE-NEXT: pand %xmm4, %xmm3
3920 ; SSE-NEXT: por %xmm2, %xmm3
3921 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0]
3922 ; SSE-NEXT: movdqa %xmm1, %xmm2
3923 ; SSE-NEXT: pandn %xmm0, %xmm2
3924 ; SSE-NEXT: pand %xmm1, %xmm3
3925 ; SSE-NEXT: por %xmm3, %xmm2
3926 ; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
3927 ; SSE-NEXT: movdqa %xmm9, %xmm3
3928 ; SSE-NEXT: pandn %xmm8, %xmm3
3929 ; SSE-NEXT: pand %xmm9, %xmm2
3930 ; SSE-NEXT: por %xmm2, %xmm3
3931 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,65535,65535,65535]
3932 ; SSE-NEXT: pand %xmm1, %xmm3
3933 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,3,3]
3934 ; SSE-NEXT: pandn %xmm0, %xmm1
3935 ; SSE-NEXT: por %xmm3, %xmm1
3936 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3937 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3938 ; SSE-NEXT: movdqa %xmm8, %xmm0
3939 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3940 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3]
3941 ; SSE-NEXT: psrlq $48, %xmm15
3942 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm15[1]
3943 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535]
3944 ; SSE-NEXT: movdqa %xmm6, %xmm2
3945 ; SSE-NEXT: pandn %xmm0, %xmm2
3946 ; SSE-NEXT: movdqa %xmm11, %xmm0
3947 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3948 ; SSE-NEXT: por %xmm2, %xmm0
3949 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3950 ; SSE-NEXT: movdqa %xmm1, %xmm3
3951 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3952 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
3953 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3954 ; SSE-NEXT: psrld $16, %xmm2
3955 ; SSE-NEXT: movdqa %xmm1, %xmm3
3956 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3957 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3958 ; SSE-NEXT: movdqa %xmm5, %xmm2
3959 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
3960 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
3961 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,1]
3962 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,5,4]
3963 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [0,65535,65535,65535,65535,0,0,0]
3964 ; SSE-NEXT: movdqa %xmm15, %xmm1
3965 ; SSE-NEXT: pandn %xmm2, %xmm1
3966 ; SSE-NEXT: pand %xmm15, %xmm0
3967 ; SSE-NEXT: por %xmm0, %xmm1
3968 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3969 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3970 ; SSE-NEXT: movdqa %xmm9, %xmm0
3971 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3972 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3]
3973 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3974 ; SSE-NEXT: psrlq $48, %xmm14
3975 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm14[1]
3976 ; SSE-NEXT: movdqa %xmm6, %xmm2
3977 ; SSE-NEXT: pandn %xmm0, %xmm2
3978 ; SSE-NEXT: movdqa %xmm13, %xmm0
3979 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3980 ; SSE-NEXT: por %xmm2, %xmm0
3981 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3982 ; SSE-NEXT: movdqa %xmm1, %xmm3
3983 ; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload
3984 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
3985 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3986 ; SSE-NEXT: psrld $16, %xmm2
3987 ; SSE-NEXT: movdqa %xmm1, %xmm3
3988 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3989 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3990 ; SSE-NEXT: movdqa %xmm10, %xmm2
3991 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
3992 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
3993 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,1]
3994 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,5,4]
3995 ; SSE-NEXT: movdqa %xmm15, %xmm1
3996 ; SSE-NEXT: pandn %xmm2, %xmm1
3997 ; SSE-NEXT: pand %xmm15, %xmm0
3998 ; SSE-NEXT: por %xmm0, %xmm1
3999 ; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
4000 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4001 ; SSE-NEXT: movdqa %xmm1, %xmm0
4002 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
4003 ; SSE-NEXT: movdqa %xmm1, %xmm14
4004 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4005 ; SSE-NEXT: psrlq $48, %xmm12
4006 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm12[1]
4007 ; SSE-NEXT: movdqa %xmm6, %xmm1
4008 ; SSE-NEXT: pandn %xmm0, %xmm1
4009 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4010 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4011 ; SSE-NEXT: por %xmm1, %xmm0
4012 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4013 ; SSE-NEXT: movdqa %xmm6, %xmm3
4014 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4015 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
4016 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4017 ; SSE-NEXT: psrld $16, %xmm2
4018 ; SSE-NEXT: movdqa %xmm6, %xmm3
4019 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
4020 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4021 ; SSE-NEXT: movdqa %xmm7, %xmm2
4022 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
4023 ; SSE-NEXT: pand %xmm15, %xmm0
4024 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
4025 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,1]
4026 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,5,4]
4027 ; SSE-NEXT: pandn %xmm2, %xmm15
4028 ; SSE-NEXT: por %xmm0, %xmm15
4029 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4030 ; SSE-NEXT: movdqa %xmm1, %xmm0
4031 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7]
4032 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4033 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4034 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
4035 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4036 ; SSE-NEXT: psrld $16, %xmm2
4037 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4038 ; SSE-NEXT: movdqa %xmm4, %xmm2
4039 ; SSE-NEXT: pandn %xmm0, %xmm2
4040 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,5,6,6]
4041 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
4042 ; SSE-NEXT: pand %xmm4, %xmm0
4043 ; SSE-NEXT: por %xmm2, %xmm0
4044 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4045 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm5[1,1]
4046 ; SSE-NEXT: movaps %xmm5, %xmm3
4047 ; SSE-NEXT: movaps {{.*#+}} xmm12 = [65535,65535,0,0,0,65535,65535,65535]
4048 ; SSE-NEXT: movaps %xmm12, %xmm1
4049 ; SSE-NEXT: andnps %xmm2, %xmm1
4050 ; SSE-NEXT: pand %xmm12, %xmm0
4051 ; SSE-NEXT: orps %xmm0, %xmm1
4052 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4053 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4054 ; SSE-NEXT: movdqa %xmm1, %xmm0
4055 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7]
4056 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4057 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4058 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
4059 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4060 ; SSE-NEXT: psrld $16, %xmm2
4061 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4062 ; SSE-NEXT: movdqa %xmm4, %xmm2
4063 ; SSE-NEXT: pandn %xmm0, %xmm2
4064 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,5,6,6]
4065 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
4066 ; SSE-NEXT: pand %xmm4, %xmm0
4067 ; SSE-NEXT: por %xmm2, %xmm0
4068 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4069 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm10[1,1]
4070 ; SSE-NEXT: movaps %xmm12, %xmm1
4071 ; SSE-NEXT: andnps %xmm2, %xmm1
4072 ; SSE-NEXT: pand %xmm12, %xmm0
4073 ; SSE-NEXT: orps %xmm0, %xmm1
4074 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4075 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4076 ; SSE-NEXT: movdqa %xmm1, %xmm0
4077 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7]
4078 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4079 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4080 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
4081 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4082 ; SSE-NEXT: psrld $16, %xmm2
4083 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4084 ; SSE-NEXT: movdqa %xmm4, %xmm2
4085 ; SSE-NEXT: pandn %xmm0, %xmm2
4086 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5,6,6]
4087 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
4088 ; SSE-NEXT: pand %xmm4, %xmm0
4089 ; SSE-NEXT: por %xmm2, %xmm0
4090 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4091 ; SSE-NEXT: movaps %xmm9, %xmm1
4092 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm7[1,1]
4093 ; SSE-NEXT: movaps %xmm7, %xmm5
4094 ; SSE-NEXT: movaps %xmm12, %xmm10
4095 ; SSE-NEXT: andnps %xmm1, %xmm10
4096 ; SSE-NEXT: pand %xmm12, %xmm0
4097 ; SSE-NEXT: orps %xmm0, %xmm10
4098 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4099 ; SSE-NEXT: movdqa %xmm1, %xmm0
4100 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7]
4101 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4102 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4103 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
4104 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4105 ; SSE-NEXT: movdqa %xmm2, %xmm1
4106 ; SSE-NEXT: psrld $16, %xmm1
4107 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4108 ; SSE-NEXT: movdqa %xmm4, %xmm1
4109 ; SSE-NEXT: pandn %xmm0, %xmm1
4110 ; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4111 ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6]
4112 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
4113 ; SSE-NEXT: pand %xmm4, %xmm0
4114 ; SSE-NEXT: por %xmm1, %xmm0
4115 ; SSE-NEXT: pand %xmm12, %xmm0
4116 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4117 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4118 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[1,1]
4119 ; SSE-NEXT: andnps %xmm1, %xmm12
4120 ; SSE-NEXT: orps %xmm0, %xmm12
4121 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4122 ; SSE-NEXT: shufps $42, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4123 ; SSE-NEXT: # xmm1 = xmm1[2,2],mem[2,0]
4124 ; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4125 ; SSE-NEXT: # xmm0 = mem[0,1,2,3,5,6,6,7]
4126 ; SSE-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0]
4127 ; SSE-NEXT: andps %xmm2, %xmm1
4128 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
4129 ; SSE-NEXT: andnps %xmm0, %xmm2
4130 ; SSE-NEXT: orps %xmm1, %xmm2
4131 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,65535,65535]
4132 ; SSE-NEXT: andps %xmm1, %xmm2
4133 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3]
4134 ; SSE-NEXT: andnps %xmm0, %xmm1
4135 ; SSE-NEXT: orps %xmm2, %xmm1
4136 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4137 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4138 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
4139 ; SSE-NEXT: movdqa %xmm4, %xmm0
4140 ; SSE-NEXT: pandn %xmm1, %xmm0
4141 ; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
4142 ; SSE-NEXT: pand %xmm4, %xmm11
4143 ; SSE-NEXT: por %xmm0, %xmm11
4144 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,0,65535]
4145 ; SSE-NEXT: movdqa %xmm2, %xmm1
4146 ; SSE-NEXT: pandn %xmm11, %xmm1
4147 ; SSE-NEXT: movaps %xmm3, %xmm11
4148 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
4149 ; SSE-NEXT: # xmm11 = xmm11[1],mem[0]
4150 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4151 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
4152 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[2,1]
4153 ; SSE-NEXT: andps %xmm2, %xmm11
4154 ; SSE-NEXT: orps %xmm1, %xmm11
4155 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4156 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
4157 ; SSE-NEXT: movdqa %xmm4, %xmm1
4158 ; SSE-NEXT: pandn %xmm0, %xmm1
4159 ; SSE-NEXT: pslldq {{.*#+}} xmm13 = zero,zero,xmm13[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
4160 ; SSE-NEXT: pand %xmm4, %xmm13
4161 ; SSE-NEXT: por %xmm1, %xmm13
4162 ; SSE-NEXT: movdqa %xmm2, %xmm1
4163 ; SSE-NEXT: pandn %xmm13, %xmm1
4164 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4165 ; SSE-NEXT: movapd %xmm7, %xmm13
4166 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
4167 ; SSE-NEXT: # xmm13 = xmm13[1],mem[0]
4168 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4169 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
4170 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[2,0],xmm0[2,1]
4171 ; SSE-NEXT: andps %xmm2, %xmm13
4172 ; SSE-NEXT: orps %xmm1, %xmm13
4173 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4174 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
4175 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4176 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
4177 ; SSE-NEXT: pand %xmm4, %xmm1
4178 ; SSE-NEXT: pandn %xmm0, %xmm4
4179 ; SSE-NEXT: por %xmm1, %xmm4
4180 ; SSE-NEXT: movaps %xmm5, %xmm1
4181 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4182 ; SSE-NEXT: # xmm1 = xmm1[1],mem[0]
4183 ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
4184 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[2,1]
4185 ; SSE-NEXT: andps %xmm2, %xmm1
4186 ; SSE-NEXT: pandn %xmm4, %xmm2
4187 ; SSE-NEXT: por %xmm1, %xmm2
4188 ; SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
4189 ; SSE-NEXT: # xmm8 = xmm8[0],mem[0]
4190 ; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
4191 ; SSE-NEXT: # xmm8 = xmm8[2,0],mem[2,1]
4192 ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4193 ; SSE-NEXT: # xmm4 = mem[0,0,1,1]
4194 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,1,1]
4195 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
4196 ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535,0,0,0,65535]
4197 ; SSE-NEXT: movdqa %xmm6, %xmm1
4198 ; SSE-NEXT: pandn %xmm4, %xmm1
4199 ; SSE-NEXT: andps %xmm6, %xmm8
4200 ; SSE-NEXT: por %xmm8, %xmm1
4201 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4202 ; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4203 ; SSE-NEXT: # xmm3 = xmm3[0],mem[0]
4204 ; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4205 ; SSE-NEXT: # xmm3 = xmm3[2,0],mem[2,1]
4206 ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4207 ; SSE-NEXT: # xmm4 = mem[0,0,1,1]
4208 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,1,1]
4209 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
4210 ; SSE-NEXT: movdqa %xmm6, %xmm8
4211 ; SSE-NEXT: pandn %xmm4, %xmm8
4212 ; SSE-NEXT: andps %xmm6, %xmm3
4213 ; SSE-NEXT: por %xmm3, %xmm8
4214 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4215 ; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4216 ; SSE-NEXT: # xmm0 = xmm0[0],mem[0]
4217 ; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4218 ; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1]
4219 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,0,1,1]
4220 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[0,0,1,1]
4221 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1]
4222 ; SSE-NEXT: movdqa %xmm6, %xmm9
4223 ; SSE-NEXT: pandn %xmm4, %xmm9
4224 ; SSE-NEXT: andps %xmm6, %xmm0
4225 ; SSE-NEXT: por %xmm0, %xmm9
4226 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4227 ; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4228 ; SSE-NEXT: # xmm0 = xmm0[0],mem[0]
4229 ; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4230 ; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1]
4231 ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4232 ; SSE-NEXT: # xmm4 = mem[0,0,1,1]
4233 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm14[0,0,1,1]
4234 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
4235 ; SSE-NEXT: andps %xmm6, %xmm0
4236 ; SSE-NEXT: pandn %xmm4, %xmm6
4237 ; SSE-NEXT: por %xmm0, %xmm6
4238 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
4239 ; SSE-NEXT: movdqa %xmm6, 336(%rax)
4240 ; SSE-NEXT: movdqa %xmm9, 224(%rax)
4241 ; SSE-NEXT: movdqa %xmm8, 112(%rax)
4242 ; SSE-NEXT: movdqa %xmm1, (%rax)
4243 ; SSE-NEXT: movdqa %xmm2, 288(%rax)
4244 ; SSE-NEXT: movaps %xmm13, 176(%rax)
4245 ; SSE-NEXT: movaps %xmm11, 64(%rax)
4246 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4247 ; SSE-NEXT: movaps %xmm0, 416(%rax)
4248 ; SSE-NEXT: movaps %xmm12, 368(%rax)
4249 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4250 ; SSE-NEXT: movaps %xmm0, 352(%rax)
4251 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4252 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
4253 ; SSE-NEXT: movaps %xmm0, 304(%rax)
4254 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4255 ; SSE-NEXT: movaps %xmm0, 272(%rax)
4256 ; SSE-NEXT: movaps %xmm10, 256(%rax)
4257 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4258 ; SSE-NEXT: movaps %xmm0, 240(%rax)
4259 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4260 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
4261 ; SSE-NEXT: movaps %xmm0, 192(%rax)
4262 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4263 ; SSE-NEXT: movaps %xmm0, 160(%rax)
4264 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4265 ; SSE-NEXT: movaps %xmm0, 144(%rax)
4266 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4267 ; SSE-NEXT: movaps %xmm0, 128(%rax)
4268 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4269 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
4270 ; SSE-NEXT: movaps %xmm0, 80(%rax)
4271 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4272 ; SSE-NEXT: movaps %xmm0, 48(%rax)
4273 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4274 ; SSE-NEXT: movaps %xmm0, 32(%rax)
4275 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4276 ; SSE-NEXT: movaps %xmm0, 16(%rax)
4277 ; SSE-NEXT: movdqa %xmm15, 320(%rax)
4278 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
4279 ; SSE-NEXT: movaps %xmm0, 208(%rax)
4280 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4281 ; SSE-NEXT: movaps %xmm0, 96(%rax)
4282 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4283 ; SSE-NEXT: movaps %xmm0, 432(%rax)
4284 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4285 ; SSE-NEXT: movaps %xmm0, 400(%rax)
4286 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4287 ; SSE-NEXT: movaps %xmm0, 384(%rax)
; SSE-NEXT: addq $680, %rsp # imm = 0x2A8
; SSE-NEXT: retq
;
; AVX-LABEL: store_i16_stride7_vf32:
; AVX: # %bb.0:
; AVX-NEXT: subq $584, %rsp # imm = 0x248
4294 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
4295 ; AVX-NEXT: vmovdqa 16(%rsi), %xmm12
4296 ; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4297 ; AVX-NEXT: vmovdqa 48(%rsi), %xmm9
4298 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm10
4299 ; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4300 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm0
4301 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4302 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
4303 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
4304 ; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
4305 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
4306 ; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
4307 ; AVX-NEXT: vandnps %ymm1, %ymm11, %ymm2
4308 ; AVX-NEXT: vmovdqa 48(%rdx), %xmm4
4309 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,2,2,2]
4310 ; AVX-NEXT: vmovdqa 48(%rcx), %xmm3
4311 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[3,3,3,3,4,5,6,7]
4312 ; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
4313 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm1[6],xmm5[7]
4314 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
4315 ; AVX-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
4316 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
4317 ; AVX-NEXT: vandps %ymm5, %ymm11, %ymm5
4318 ; AVX-NEXT: vorps %ymm2, %ymm5, %ymm6
4319 ; AVX-NEXT: vextractf128 $1, %ymm6, %xmm5
4320 ; AVX-NEXT: vmovdqa 48(%r8), %xmm2
4321 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,2,2,2]
4322 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm5[1,2,3,4,5,6],xmm7[7]
4323 ; AVX-NEXT: vmovdqa 48(%r9), %xmm13
4324 ; AVX-NEXT: vpsrldq {{.*#+}} xmm8 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4325 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6,7]
4326 ; AVX-NEXT: vmovdqa 48(%rax), %xmm5
4327 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,3,3]
4328 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4,5,6,7]
4329 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4330 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,1,1]
4331 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5,6,7]
4332 ; AVX-NEXT: vpsrld $16, %xmm13, %xmm7
4333 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6,7]
4334 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4335 ; AVX-NEXT: vpsrlq $48, %xmm3, %xmm6
4336 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm4[1],xmm6[1]
4337 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
4338 ; AVX-NEXT: vandnps %ymm1, %ymm11, %ymm1
4339 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,2,3,3]
4340 ; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4341 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0
4342 ; AVX-NEXT: vandps %ymm0, %ymm11, %ymm0
4343 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
4344 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
4345 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,2,3,3]
4346 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5],xmm1[6,7]
4347 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm7[6,7]
4348 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[3,3,3,3]
4349 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1,2,3,4,5,6],xmm6[7]
4350 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4351 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6,7]
4352 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,5,6,6,7]
4353 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
4354 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6],xmm1[7]
4355 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
4356 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
4357 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4358 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
4359 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4360 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
4361 ; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
4362 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
4363 ; AVX-NEXT: vmovdqa 16(%rcx), %xmm7
4364 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4365 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7]
4366 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
4367 ; AVX-NEXT: vmovdqa 16(%rdx), %xmm8
4368 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4369 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[2,2,2,2]
4370 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm6[6],xmm1[7]
4371 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
4372 ; AVX-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4373 ; AVX-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9]
4374 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
4375 ; AVX-NEXT: vandnps %ymm0, %ymm11, %ymm0
4376 ; AVX-NEXT: vandps %ymm1, %ymm11, %ymm1
4377 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
4378 ; AVX-NEXT: vmovdqa 16(%r8), %xmm7
4379 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4380 ; AVX-NEXT: vmovdqa 16(%r9), %xmm11
4381 ; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4382 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
4383 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
4384 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
4385 ; AVX-NEXT: vmovdqa 16(%rax), %xmm8
4386 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4387 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[2,2,3,3]
4388 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5,6,7]
4389 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
4390 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4391 ; AVX-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4392 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3],xmm6[4,5,6,7]
4393 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1
4394 ; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
4395 ; AVX-NEXT: vandps %ymm6, %ymm0, %ymm0
4396 ; AVX-NEXT: vandnps %ymm1, %ymm6, %ymm1
4397 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
4398 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4399 ; AVX-NEXT: vmovdqa 32(%rsi), %xmm11
4400 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm12
4401 ; AVX-NEXT: vpsrld $16, %xmm11, %xmm0
4402 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
4403 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
4404 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4405 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
4406 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
4407 ; AVX-NEXT: vmovdqa 32(%rcx), %xmm7
4408 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7]
4409 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
4410 ; AVX-NEXT: vmovdqa 32(%rdx), %xmm10
4411 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[2,2,2,2]
4412 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5],xmm6[6],xmm1[7]
4413 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
4414 ; AVX-NEXT: vmovdqa %xmm7, %xmm1
4415 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4416 ; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5,6,6]
4417 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
4418 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
4419 ; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
4420 ; AVX-NEXT: vandnps %ymm0, %ymm7, %ymm0
4421 ; AVX-NEXT: vandps %ymm7, %ymm6, %ymm6
4422 ; AVX-NEXT: vorps %ymm0, %ymm6, %ymm7
4423 ; AVX-NEXT: vmovdqa 32(%r8), %xmm14
4424 ; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4425 ; AVX-NEXT: vmovdqa 32(%r9), %xmm0
4426 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4427 ; AVX-NEXT: vmovdqa 32(%rax), %xmm6
4428 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
4429 ; AVX-NEXT: vpsrldq {{.*#+}} xmm15 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4430 ; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm6[3],xmm15[4,5,6,7]
4431 ; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm0[0,2],xmm6[1,3]
4432 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
4433 ; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
4434 ; AVX-NEXT: vandps %ymm7, %ymm15, %ymm7
4435 ; AVX-NEXT: vandnps %ymm14, %ymm15, %ymm14
4436 ; AVX-NEXT: vorps %ymm7, %ymm14, %ymm7
4437 ; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4438 ; AVX-NEXT: vpsrld $16, %xmm3, %xmm7
4439 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
4440 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
4441 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4442 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6]
4443 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
4444 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3
4445 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4446 ; AVX-NEXT: vpsrld $16, %xmm9, %xmm4
4447 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4448 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
4449 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
4450 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7]
4451 ; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4]
4452 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4
4453 ; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
4454 ; AVX-NEXT: vandnps %ymm3, %ymm7, %ymm3
4455 ; AVX-NEXT: vandps %ymm7, %ymm4, %ymm4
4456 ; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3
4457 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3]
4458 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4459 ; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
4460 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4461 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,1,0,1]
4462 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5],xmm2[6,7]
4463 ; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm7[0,2],xmm5[1,3]
4464 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
4465 ; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
4466 ; AVX-NEXT: vandps %ymm4, %ymm3, %ymm3
4467 ; AVX-NEXT: vandnps %ymm2, %ymm4, %ymm2
4468 ; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2
4469 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4470 ; AVX-NEXT: vpsrld $16, %xmm1, %xmm2
4471 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1]
4472 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,0,1,1]
4473 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
4474 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
4475 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
4476 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7]
4477 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
4478 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
4479 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4]
4480 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
4481 ; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
4482 ; AVX-NEXT: vandnps %ymm1, %ymm5, %ymm1
4483 ; AVX-NEXT: vandps %ymm5, %ymm2, %ymm2
4484 ; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1
4485 ; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
4486 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,0,1]
4487 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7]
4488 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
4489 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,0,0,0]
4490 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
4491 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
4492 ; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
4493 ; AVX-NEXT: vandps %ymm4, %ymm1, %ymm1
4494 ; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0
4495 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
4496 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4497 ; AVX-NEXT: vmovdqa (%rdx), %xmm9
4498 ; AVX-NEXT: vmovdqa (%rcx), %xmm7
4499 ; AVX-NEXT: vpsrld $16, %xmm7, %xmm0
4500 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
4501 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
4502 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,1,1]
4503 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2
4504 ; AVX-NEXT: vmovdqa (%rsi), %xmm0
4505 ; AVX-NEXT: vmovdqa (%rdi), %xmm14
4506 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
4507 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7]
4508 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1]
4509 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
4510 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7]
4511 ; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
4512 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11
4513 ; AVX-NEXT: vandnps %ymm2, %ymm5, %ymm2
4514 ; AVX-NEXT: vandps %ymm5, %ymm11, %ymm5
4515 ; AVX-NEXT: vorps %ymm2, %ymm5, %ymm15
4516 ; AVX-NEXT: vmovdqa (%r8), %xmm1
4517 ; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
4518 ; AVX-NEXT: vmovdqa (%r9), %xmm5
4519 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4520 ; AVX-NEXT: vmovdqa (%rax), %xmm2
4521 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
4522 ; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
4523 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,0,1]
4524 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm12[5],xmm1[6,7]
4525 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,1,0,1]
4526 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,0,0,0]
4527 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4,5],xmm11[6,7]
4528 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm1
4529 ; AVX-NEXT: vandps %ymm4, %ymm15, %ymm11
4530 ; AVX-NEXT: vandnps %ymm1, %ymm4, %ymm1
4531 ; AVX-NEXT: vorps %ymm1, %ymm11, %ymm1
4532 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4533 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
4534 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
4535 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
4536 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4537 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4538 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
4539 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7]
4540 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
4541 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[2,2,2,2]
4542 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm11[6],xmm1[7]
4543 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6]
4544 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
4545 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
4546 ; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
4547 ; AVX-NEXT: vandnps %ymm0, %ymm12, %ymm0
4548 ; AVX-NEXT: vandps %ymm1, %ymm12, %ymm1
4549 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
4550 ; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4551 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4552 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7]
4553 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm5[0,2],xmm2[1,3]
4554 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
4555 ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
4556 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
4557 ; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1
4558 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
4559 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4560 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4561 ; AVX-NEXT: vpsrld $16, %xmm2, %xmm0
4562 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4563 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
4564 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
4565 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4566 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6]
4567 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
4568 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
4569 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4570 ; AVX-NEXT: vpsrld $16, %xmm5, %xmm1
4571 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4572 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
4573 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
4574 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
4575 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4]
4576 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
4577 ; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
4578 ; AVX-NEXT: vandnps %ymm0, %ymm3, %ymm0
4579 ; AVX-NEXT: vandps %ymm3, %ymm1, %ymm1
4580 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
4581 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4582 ; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
4583 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4584 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,1,0,1]
4585 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5],xmm1[6,7]
4586 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm5[0,2],xmm13[1,3]
4587 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
4588 ; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
4589 ; AVX-NEXT: vandps %ymm3, %ymm0, %ymm0
4590 ; AVX-NEXT: vandnps %ymm1, %ymm3, %ymm1
4591 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
4592 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4593 ; AVX-NEXT: vpsrlq $48, %xmm2, %xmm0
4594 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm4[1],xmm0[1]
4595 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4596 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
4597 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4598 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
4599 ; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4600 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
4601 ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
4602 ; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0
4603 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
4604 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
4605 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4606 ; AVX-NEXT: vpsrld $16, %xmm2, %xmm1
4607 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4608 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
4609 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
4610 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
4611 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4]
4612 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[2,3,2,3]
4613 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3,4,5,6,7]
4614 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
4615 ; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13]
4616 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
4617 ; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
4618 ; AVX-NEXT: vandps %ymm5, %ymm0, %ymm0
4619 ; AVX-NEXT: vandnps %ymm3, %ymm5, %ymm3
4620 ; AVX-NEXT: vorps %ymm3, %ymm0, %ymm0
4621 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4622 ; AVX-NEXT: vmovdqa %xmm10, %xmm4
4623 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4624 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
4625 ; AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
4626 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
4627 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4628 ; AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
4629 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[2,2,3,3]
4630 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
4631 ; AVX-NEXT: vandnps %ymm0, %ymm12, %ymm0
4632 ; AVX-NEXT: vandps %ymm3, %ymm12, %ymm3
4633 ; AVX-NEXT: vorps %ymm0, %ymm3, %ymm0
4634 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4635 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4636 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
4637 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
4638 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4]
4639 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
4640 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3,4,5,6,7]
4641 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7]
4642 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,2,4,5,6,7]
4643 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
4644 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,2,3,3]
4645 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3],xmm5[4,5,6,7]
4646 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
4647 ; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
4648 ; AVX-NEXT: vandnps %ymm0, %ymm15, %ymm0
4649 ; AVX-NEXT: vandps %ymm3, %ymm15, %ymm3
4650 ; AVX-NEXT: vorps %ymm0, %ymm3, %ymm0
4651 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4652 ; AVX-NEXT: vpsrlq $48, %xmm2, %xmm0
4653 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm4[1],xmm0[1]
4654 ; AVX-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4655 ; AVX-NEXT: # xmm3 = mem[0,0,1,1]
4656 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
4657 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4658 ; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
4659 ; AVX-NEXT: # xmm3 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
4660 ; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4661 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7]
4662 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
4663 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
4664 ; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
4665 ; AVX-NEXT: vandnps %ymm0, %ymm8, %ymm0
4666 ; AVX-NEXT: vandps %ymm3, %ymm8, %ymm3
4667 ; AVX-NEXT: vorps %ymm0, %ymm3, %ymm3
4668 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4669 ; AVX-NEXT: vpblendw $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload
4670 ; AVX-NEXT: # xmm14 = mem[0,1,2],xmm2[3],mem[4,5,6,7]
4671 ; AVX-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4672 ; AVX-NEXT: # xmm5 = mem[0,1,0,1]
4673 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,0,0,0]
4674 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm10[6,7]
4675 ; AVX-NEXT: vpsrld $16, %xmm1, %xmm10
4676 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
4677 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
4678 ; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13]
4679 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
4680 ; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
4681 ; AVX-NEXT: vandps %ymm6, %ymm3, %ymm3
4682 ; AVX-NEXT: vandnps %ymm5, %ymm6, %ymm5
4683 ; AVX-NEXT: vorps %ymm5, %ymm3, %ymm5
4684 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
4685 ; AVX-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9]
4686 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3
4687 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4688 ; AVX-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
4689 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[2,2,3,3]
4690 ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10
4691 ; AVX-NEXT: vandnps %ymm3, %ymm12, %ymm3
4692 ; AVX-NEXT: vandps %ymm12, %ymm10, %ymm10
4693 ; AVX-NEXT: vorps %ymm3, %ymm10, %ymm3
4694 ; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
4695 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4696 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
4697 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7]
4698 ; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4]
4699 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4700 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3]
4701 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5,6,7]
4702 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
4703 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7]
4704 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1]
4705 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,2,3,3]
4706 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3],xmm11[4,5,6,7]
4707 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10
4708 ; AVX-NEXT: vandnps %ymm3, %ymm15, %ymm3
4709 ; AVX-NEXT: vandps %ymm15, %ymm10, %ymm10
4710 ; AVX-NEXT: vorps %ymm3, %ymm10, %ymm3
4711 ; AVX-NEXT: vpsrlq $48, %xmm7, %xmm7
4712 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm9[1],xmm7[1]
4713 ; AVX-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
4714 ; AVX-NEXT: # xmm9 = mem[0,0,1,1]
4715 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7
4716 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4717 ; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
4718 ; AVX-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3]
4719 ; AVX-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
4720 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,2,2,4,5,6,7]
4721 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
4722 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm4
4723 ; AVX-NEXT: vandnps %ymm7, %ymm8, %ymm7
4724 ; AVX-NEXT: vandps %ymm4, %ymm8, %ymm4
4725 ; AVX-NEXT: vorps %ymm7, %ymm4, %ymm4
4726 ; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
4727 ; AVX-NEXT: # xmm7 = mem[0,1,0,1]
4728 ; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm13[0,0,0,0]
4729 ; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3]
4730 ; AVX-NEXT: vpsrld $16, %xmm2, %xmm8
4731 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm1[2],xmm8[2],xmm1[3],xmm8[3]
4732 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
4733 ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13]
4734 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1
4735 ; AVX-NEXT: vandps %ymm6, %ymm4, %ymm2
4736 ; AVX-NEXT: vandnps %ymm1, %ymm6, %ymm1
4737 ; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1
4738 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
4739 ; AVX-NEXT: vmovaps %ymm1, 96(%rax)
4740 ; AVX-NEXT: vmovaps %ymm3, 64(%rax)
4741 ; AVX-NEXT: vmovaps %ymm5, 320(%rax)
4742 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4743 ; AVX-NEXT: vmovaps %ymm0, 288(%rax)
4744 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4745 ; AVX-NEXT: vmovaps %ymm0, 192(%rax)
4746 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
4747 ; AVX-NEXT: vmovaps %ymm0, 128(%rax)
4748 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4749 ; AVX-NEXT: vmovaps %ymm1, 32(%rax)
4750 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4751 ; AVX-NEXT: vmovaps %ymm1, (%rax)
4752 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4753 ; AVX-NEXT: vmovaps %ymm1, 224(%rax)
4754 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4755 ; AVX-NEXT: vmovaps %ymm1, 352(%rax)
4756 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4757 ; AVX-NEXT: vmovaps %ymm1, 256(%rax)
4758 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4759 ; AVX-NEXT: vmovaps %ymm1, 160(%rax)
4760 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4761 ; AVX-NEXT: vmovaps %xmm1, 416(%rax)
4762 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4763 ; AVX-NEXT: vmovaps %xmm1, 432(%rax)
4764 ; AVX-NEXT: vmovdqa %xmm14, 384(%rax)
4765 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4766 ; AVX-NEXT: vmovaps %xmm0, 400(%rax)
4767 ; AVX-NEXT: addq $584, %rsp # imm = 0x248
4768 ; AVX-NEXT: vzeroupper
4769 ; AVX-NEXT: retq
4770 ;
4771 ; AVX2-LABEL: store_i16_stride7_vf32:
4772 ; AVX2: # %bb.0:
4773 ; AVX2-NEXT: subq $616, %rsp # imm = 0x268
4774 ; AVX2-NEXT: vmovdqa (%rdi), %ymm13
4775 ; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4776 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm8
4777 ; AVX2-NEXT: vmovdqa (%rsi), %ymm15
4778 ; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4779 ; AVX2-NEXT: vmovdqa 32(%rsi), %ymm10
4780 ; AVX2-NEXT: vmovdqa (%rdx), %ymm12
4781 ; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4782 ; AVX2-NEXT: vmovdqa 32(%rdx), %ymm11
4783 ; AVX2-NEXT: vmovdqa (%rcx), %ymm14
4784 ; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4785 ; AVX2-NEXT: vmovdqa 32(%rcx), %ymm9
4786 ; AVX2-NEXT: vmovdqa 32(%r8), %ymm6
4787 ; AVX2-NEXT: vmovdqa 32(%r9), %ymm7
4788 ; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4]
4789 ; AVX2-NEXT: vpermd %ymm8, %ymm0, %ymm1
4790 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,3,2,3,4,7,6,7]
4791 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
4792 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535]
4793 ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
4794 ; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,0,0,0,4,0,0]
4795 ; AVX2-NEXT: vpermd %ymm11, %ymm2, %ymm4
4796 ; AVX2-NEXT: vpermd %ymm13, %ymm0, %ymm0
4797 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[0,3,2,3,4,7,6,7]
4798 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
4799 ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0
4800 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
4801 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
4802 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0]
4803 ; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
4804 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535]
4805 ; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
4806 ; AVX2-NEXT: vpermd %ymm12, %ymm2, %ymm2
4807 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
4808 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
4809 ; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2
4810 ; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,3,0,4]
4811 ; AVX2-NEXT: vpermd %ymm6, %ymm3, %ymm5
4812 ; AVX2-NEXT: vmovdqa %ymm6, %ymm12
4813 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4814 ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
4815 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,1,0,3,4,5,4,7]
4816 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15]
4817 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0]
4818 ; AVX2-NEXT: vpblendvb %ymm4, %ymm5, %ymm2, %ymm2
4819 ; AVX2-NEXT: vmovdqa (%r8), %ymm5
4820 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4821 ; AVX2-NEXT: vpermd %ymm5, %ymm3, %ymm3
4822 ; AVX2-NEXT: vmovdqa (%r9), %ymm5
4823 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4824 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7]
4825 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15]
4826 ; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3
4827 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
4828 ; AVX2-NEXT: vmovdqa 32(%rax), %ymm13
4829 ; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,0,0,3,0,0,0,4]
4830 ; AVX2-NEXT: vpermd %ymm13, %ymm4, %ymm5
4831 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0]
4832 ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2
4833 ; AVX2-NEXT: vmovdqa (%rax), %ymm5
4834 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4835 ; AVX2-NEXT: vpermd %ymm5, %ymm4, %ymm4
4836 ; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3
4837 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
4838 ; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
4839 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4840 ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0
4841 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4842 ; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4843 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
4844 ; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4845 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[2,2,2,2,6,6,6,6]
4846 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
4847 ; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4848 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
4849 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
4850 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4851 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6]
4852 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
4853 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
4854 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4855 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
4856 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
4857 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[1,2,2,3,5,6,6,7]
4858 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
4859 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
4860 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
4861 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4862 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,1,2,3,6,5,6,7]
4863 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
4864 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
4865 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
4866 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
4867 ; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4868 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,1,2,2,4,5,6,6]
4869 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
4870 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
4871 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
4872 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4873 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
4874 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
4875 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7]
4876 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
4877 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
4878 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
4879 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7]
4880 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
4881 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
4882 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
4883 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
4884 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
4885 ; AVX2-NEXT: vpbroadcastd 60(%r8), %ymm1
4886 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
4887 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
4888 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
4889 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
4890 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
4891 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
4892 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,3,3,3,6,7,7,7]
4893 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
4894 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
4895 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
4896 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4897 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm0
4898 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4899 ; AVX2-NEXT: vmovdqa 32(%rsi), %xmm6
4900 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
4901 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
4902 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
4903 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
4904 ; AVX2-NEXT: vmovdqa 32(%rcx), %xmm11
4905 ; AVX2-NEXT: vmovdqa 32(%rdx), %xmm10
4906 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
4907 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
4908 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
4909 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
4910 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
4911 ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm12
4912 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
4913 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4914 ; AVX2-NEXT: vmovdqa (%rsi), %xmm13
4915 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
4916 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
4917 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
4918 ; AVX2-NEXT: vmovdqa (%rcx), %xmm8
4919 ; AVX2-NEXT: vmovdqa (%rdx), %xmm7
4920 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
4921 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
4922 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
4923 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
4924 ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm2
4925 ; AVX2-NEXT: vmovdqa 32(%r9), %xmm0
4926 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4927 ; AVX2-NEXT: vmovdqa 32(%r8), %xmm1
4928 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4929 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4930 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,1,3,2,4,5,6,7]
4931 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
4932 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
4933 ; AVX2-NEXT: vpbroadcastd 32(%rax), %ymm9
4934 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0]
4935 ; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm9, %ymm0
4936 ; AVX2-NEXT: vmovdqa (%r9), %xmm5
4937 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4938 ; AVX2-NEXT: vmovdqa (%r8), %xmm1
4939 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4940 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
4941 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm1[0,1,3,2,4,5,6,7]
4942 ; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,1,3]
4943 ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1]
4944 ; AVX2-NEXT: vpbroadcastd (%rax), %ymm14
4945 ; AVX2-NEXT: vpblendvb %ymm3, %ymm15, %ymm14, %ymm3
4946 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
4947 ; AVX2-NEXT: vpblendvb %ymm14, %ymm12, %ymm0, %ymm15
4948 ; AVX2-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm0
4949 ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
4950 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
4951 ; AVX2-NEXT: vpshufb %xmm0, %xmm11, %xmm2
4952 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,1,2,2]
4953 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
4954 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[3,3,3,3,4,5,6,7]
4955 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
4956 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4957 ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[1,1,2,3]
4958 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm3[2],xmm12[3,4],xmm3[5],xmm12[6,7]
4959 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
4960 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
4961 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
4962 ; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm9
4963 ; AVX2-NEXT: vpshufb %xmm0, %xmm8, %xmm0
4964 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2]
4965 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7]
4966 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[3,3,3,3,4,5,6,7]
4967 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
4968 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4969 ; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[1,1,2,3]
4970 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm3[2],xmm14[3,4],xmm3[5],xmm14[6,7]
4971 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
4972 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
4973 ; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm3, %ymm0
4974 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,7,6]
4975 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,3,3]
4976 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
4977 ; AVX2-NEXT: vpbroadcastd 36(%rax), %ymm4
4978 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
4979 ; AVX2-NEXT: vpblendvb %ymm12, %ymm3, %ymm4, %ymm3
4980 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6]
4981 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,3,3]
4982 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
4983 ; AVX2-NEXT: vpbroadcastd 4(%rax), %ymm4
4984 ; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1
4985 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
4986 ; AVX2-NEXT: vpblendvb %ymm4, %ymm9, %ymm3, %ymm14
4987 ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
4988 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4989 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
4990 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
4991 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
4992 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
4993 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
4994 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4]
4995 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
4996 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0]
4997 ; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
4998 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
4999 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7]
5000 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7]
5001 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
5002 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
5003 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4]
5004 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3]
5005 ; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
5006 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5007 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
5008 ; AVX2-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
5009 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
5010 ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
5011 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
5012 ; AVX2-NEXT: vpbroadcastd 40(%rax), %ymm3
5013 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
5014 ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
5015 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5016 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
5017 ; AVX2-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
5018 ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
5019 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
5020 ; AVX2-NEXT: vpbroadcastd 8(%rax), %ymm5
5021 ; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3
5022 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
5023 ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
5024 ; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
5025 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
5026 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
5027 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
5028 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
5029 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[3,3,3,3,7,7,7,7]
5030 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8,9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
5031 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
5032 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
5033 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
5034 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5035 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7]
5036 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
5037 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
5038 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
5039 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
5040 ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
5041 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
5042 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
5043 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,3,6,6,6,7]
5044 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
5045 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[3,3,3,3,7,7,7,7]
5046 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
5047 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
5048 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
5049 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[2,3,3,3,6,7,7,7]
5050 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2]
5051 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
5052 ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
5053 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
5054 ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
5055 ; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
5056 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[2,2,2,2,6,6,6,6]
5057 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
5058 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
5059 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
5060 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,2,2,2,6,6,6,6]
5061 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
5062 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
5063 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
5064 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
5065 ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
5066 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
5067 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
5068 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[1,2,2,3,5,6,6,7]
5069 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
5070 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2]
5071 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[0,1,2,2,4,5,6,6]
5072 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3]
5073 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535]
5074 ; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
5075 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
5076 ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
5077 ; AVX2-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
5078 ; AVX2-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5]
5079 ; AVX2-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
5080 ; AVX2-NEXT: # ymm5 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
5081 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5]
5082 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
5083 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
5084 ; AVX2-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
5085 ; AVX2-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
5086 ; AVX2-NEXT: # ymm6 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
5087 ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4]
5088 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
5089 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
5090 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2]
5091 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
5092 ; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
5093 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[1,1,1,1,5,5,5,5]
5094 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
5095 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5]
5096 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7,8,9],ymm5[10],ymm7[11,12],ymm5[13],ymm7[14,15]
5097 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm8[0,1,1,3,4,5,5,7]
5098 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
5099 ; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,0,0,4,4,4,4]
5100 ; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15]
5101 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
5102 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
5103 ; AVX2-NEXT: vpblendvb %ymm6, %ymm5, %ymm7, %ymm5
5104 ; AVX2-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
5105 ; AVX2-NEXT: # ymm6 = mem[0,0,2,1,4,4,6,5]
5106 ; AVX2-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
5107 ; AVX2-NEXT: # ymm7 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
5108 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,0,0,0,4,4,4,4]
5109 ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8,9,10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15]
5110 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3]
5111 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
5112 ; AVX2-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7]
5113 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
5114 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0]
5115 ; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
5116 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[0,0,2,1,4,4,6,5]
5117 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
5118 ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4]
5119 ; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8,9,10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15]
5120 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
5121 ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,1,1,3,4,5,5,7]
5122 ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
5123 ; AVX2-NEXT: vpblendvb %ymm8, %ymm7, %ymm9, %ymm7
5124 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
5125 ; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm6, %ymm4
5126 ; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5
5127 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
5128 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5129 ; AVX2-NEXT: vmovaps %ymm6, 96(%rax)
5130 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5131 ; AVX2-NEXT: vmovaps %ymm6, 320(%rax)
5132 ; AVX2-NEXT: vmovdqa %ymm5, 128(%rax)
5133 ; AVX2-NEXT: vmovdqa %ymm4, 352(%rax)
5134 ; AVX2-NEXT: vmovdqa %ymm3, 160(%rax)
5135 ; AVX2-NEXT: vmovdqa %ymm2, 192(%rax)
5136 ; AVX2-NEXT: vmovdqa %ymm1, 64(%rax)
5137 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5138 ; AVX2-NEXT: vmovaps %ymm1, 32(%rax)
5139 ; AVX2-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
5140 ; AVX2-NEXT: vmovaps %ymm1, (%rax)
5141 ; AVX2-NEXT: vmovdqa %ymm15, 224(%rax)
5142 ; AVX2-NEXT: vmovdqa %ymm0, 288(%rax)
5143 ; AVX2-NEXT: vmovdqa %ymm14, 256(%rax)
5144 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5145 ; AVX2-NEXT: vmovaps %ymm0, 416(%rax)
5146 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5147 ; AVX2-NEXT: vmovaps %ymm0, 384(%rax)
5148 ; AVX2-NEXT: addq $616, %rsp # imm = 0x268
5149 ; AVX2-NEXT: vzeroupper
5150 ; AVX2-NEXT: retq
5151 ;
5152 ; AVX2-FP-LABEL: store_i16_stride7_vf32:
5153 ; AVX2-FP: # %bb.0:
5154 ; AVX2-FP-NEXT: subq $616, %rsp # imm = 0x268
5155 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm11
5156 ; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5157 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm8
5158 ; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm14
5159 ; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5160 ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm9
5161 ; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm13
5162 ; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5163 ; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm12
5164 ; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm15
5165 ; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5166 ; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm10
5167 ; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm7
5168 ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4]
5169 ; AVX2-FP-NEXT: vpermd %ymm8, %ymm0, %ymm1
5170 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31]
5171 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm9, %ymm3
5172 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535]
5173 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
5174 ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,3,0,0,0,4,0,0]
5175 ; AVX2-FP-NEXT: vpermd %ymm12, %ymm3, %ymm5
5176 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31]
5177 ; AVX2-FP-NEXT: vpermd %ymm11, %ymm0, %ymm0
5178 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm14, %ymm2
5179 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
5180 ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm10, %ymm2
5181 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0]
5182 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm5, %ymm2, %ymm2
5183 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535]
5184 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
5185 ; AVX2-FP-NEXT: vmovdqa 32(%r9), %ymm11
5186 ; AVX2-FP-NEXT: vpermd %ymm13, %ymm3, %ymm2
5187 ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm15, %ymm3
5188 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
5189 ; AVX2-FP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,3,0,4]
5190 ; AVX2-FP-NEXT: vpermd %ymm7, %ymm3, %ymm4
5191 ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5192 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0
5193 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31]
5194 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm11, %ymm5
5195 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0]
5196 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
5197 ; AVX2-FP-NEXT: vmovdqa (%r8), %ymm5
5198 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5199 ; AVX2-FP-NEXT: vpermd %ymm5, %ymm3, %ymm3
5200 ; AVX2-FP-NEXT: vmovdqa (%r9), %ymm5
5201 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5202 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm5, %ymm2
5203 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2
5204 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5205 ; AVX2-FP-NEXT: vmovdqa 32(%rax), %ymm13
5206 ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,0,0,3,0,0,0,4]
5207 ; AVX2-FP-NEXT: vpermd %ymm13, %ymm3, %ymm5
5208 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0]
5209 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
5210 ; AVX2-FP-NEXT: vmovdqa (%rax), %ymm5
5211 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5212 ; AVX2-FP-NEXT: vpermd %ymm5, %ymm3, %ymm3
5213 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2
5214 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
5215 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm1
5216 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5217 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
5218 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5219 ; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5220 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
5221 ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5222 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,2,2,2,6,6,6,6]
5223 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
5224 ; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5225 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
5226 ; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5227 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6]
5228 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
5229 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
5230 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
5231 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
5232 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5233 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[1,2,2,3,5,6,6,7]
5234 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
5235 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
5236 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5237 ; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5238 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
5239 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
5240 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
5241 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5242 ; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5243 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,1,2,2,4,5,6,6]
5244 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
5245 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
5246 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5247 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5248 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
5249 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7]
5250 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
5251 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
5252 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
5253 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
5254 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
5255 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
5256 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
5257 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5258 ; AVX2-FP-NEXT: vpbroadcastd 60(%r8), %ymm1
5259 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
5260 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5261 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
5262 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
5263 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
5264 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5265 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,3,3,3,6,7,7,7]
5266 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
5267 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
5268 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5269 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5270 ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm6
5271 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm0
5272 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5273 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
5274 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
5275 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
5276 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
5277 ; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm12
5278 ; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm11
5279 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
5280 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
5281 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
5282 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
5283 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
5284 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm9
5285 ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm14
5286 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm13
5287 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
5288 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
5289 ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm8
5290 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm7
5291 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
5292 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, %xmm1
5293 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
5294 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
5295 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm2
5296 ; AVX2-FP-NEXT: vmovdqa 32(%r9), %xmm0
5297 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5298 ; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm1
5299 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5300 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
5301 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
5302 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm5, %xmm0
5303 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
5304 ; AVX2-FP-NEXT: vpbroadcastd 32(%rax), %ymm10
5305 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0]
5306 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm10, %ymm0
5307 ; AVX2-FP-NEXT: vmovdqa (%r9), %xmm1
5308 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5309 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm10
5310 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3]
5311 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm3
5312 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
5313 ; AVX2-FP-NEXT: vpbroadcastd (%rax), %ymm15
5314 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm3, %ymm15, %ymm3
5315 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
5316 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm9, %ymm0, %ymm0
5317 ; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
5318 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm0
5319 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5320 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
5321 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm2
5322 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,2,2]
5323 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
5324 ; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
5325 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm6, %xmm4
5326 ; AVX2-FP-NEXT: vmovdqa %xmm6, %xmm9
5327 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5328 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[1,1,2,3]
5329 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7]
5330 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
5331 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
5332 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
5333 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm2, %ymm4, %ymm2
5334 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm14, %xmm0
5335 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,2,2]
5336 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7]
5337 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm8, %xmm3
5338 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3]
5339 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
5340 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
5341 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
5342 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm0, %ymm3, %ymm0
5343 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
5344 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm5, %xmm4
5345 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3]
5346 ; AVX2-FP-NEXT: vpbroadcastd 36(%rax), %ymm5
5347 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
5348 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4
5349 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
5350 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
5351 ; AVX2-FP-NEXT: vpbroadcastd 4(%rax), %ymm3
5352 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1
5353 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
5354 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm15
5355 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5356 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5357 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
5358 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
5359 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
5360 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
5361 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
5362 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7]
5363 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
5364 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0]
5365 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5366 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
5367 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
5368 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7]
5369 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
5370 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7]
5371 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
5372 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
5373 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5374 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
5375 ; AVX2-FP-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
5376 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
5377 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
5378 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
5379 ; AVX2-FP-NEXT: vpbroadcastd 40(%rax), %ymm3
5380 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
5381 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
5382 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm3 # 16-byte Folded Reload
5383 ; AVX2-FP-NEXT: # xmm3 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7]
5384 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm3
5385 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
5386 ; AVX2-FP-NEXT: vpbroadcastd 8(%rax), %ymm5
5387 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3
5388 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
5389 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
5390 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
5391 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
5392 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
5393 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5394 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[3,3,3,3,7,7,7,7]
5395 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8,9,10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
5396 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
5397 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
5398 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
5399 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[3,3,3,3,7,7,7,7]
5400 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
5401 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
5402 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
5403 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
5404 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
5405 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
5406 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[3,3,3,3,7,7,7,7]
5407 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
5408 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31]
5409 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15]
5410 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
5411 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
5412 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[2,3,3,3,6,7,7,7]
5413 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2]
5414 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
5415 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
5416 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
5417 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
5418 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
5419 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,2,2,2,6,6,6,6]
5420 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
5421 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
5422 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[2,2,2,2,6,6,6,6]
5423 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
5424 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
5425 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
5426 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
5427 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
5428 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u]
5429 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[1,2,2,3,5,6,6,7]
5430 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
5431 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2]
5432 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[0,1,2,2,4,5,6,6]
5433 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3]
5434 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535]
5435 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
5436 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
5437 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
5438 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
5439 ; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1]
5440 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5441 ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
5442 ; AVX2-FP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
5443 ; AVX2-FP-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5]
5444 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15]
5445 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5446 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
5447 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
5448 ; AVX2-FP-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7]
5449 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
5450 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
5451 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2]
5452 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
5453 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4
5454 ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm10, %ymm5
5455 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[1,1,1,1,5,5,5,5]
5456 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
5457 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
5458 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,1,1,3,4,5,5,7]
5459 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7,8,9],ymm6[10],ymm8[11,12],ymm6[13],ymm8[14,15]
5460 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
5461 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2]
5462 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
5463 ; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
5464 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
5465 ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm7, %ymm7
5466 ; AVX2-FP-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
5467 ; AVX2-FP-NEXT: # ymm8 = mem[0,0,2,1,4,4,6,5]
5468 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7,8,9,10],ymm8[11],ymm7[12,13],ymm8[14],ymm7[15]
5469 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
5470 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
5471 ; AVX2-FP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
5472 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
5473 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0]
5474 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
5475 ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm13, %ymm6
5476 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[0,0,2,1,4,4,6,5]
5477 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8,9,10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15]
5478 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3]
5479 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm14[0,1,1,3,4,5,5,7]
5480 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
5481 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6
5482 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
5483 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4
5484 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5
5485 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5486 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5487 ; AVX2-FP-NEXT: vmovaps %ymm6, 96(%rax)
5488 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5489 ; AVX2-FP-NEXT: vmovaps %ymm6, 320(%rax)
5490 ; AVX2-FP-NEXT: vmovdqa %ymm5, 128(%rax)
5491 ; AVX2-FP-NEXT: vmovdqa %ymm4, 352(%rax)
5492 ; AVX2-FP-NEXT: vmovdqa %ymm3, 160(%rax)
5493 ; AVX2-FP-NEXT: vmovdqa %ymm2, 192(%rax)
5494 ; AVX2-FP-NEXT: vmovdqa %ymm1, 64(%rax)
5495 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5496 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rax)
5497 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5498 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rax)
5499 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
5500 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rax)
5501 ; AVX2-FP-NEXT: vmovdqa %ymm0, 288(%rax)
5502 ; AVX2-FP-NEXT: vmovdqa %ymm15, 256(%rax)
5503 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5504 ; AVX2-FP-NEXT: vmovaps %ymm0, 416(%rax)
5505 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5506 ; AVX2-FP-NEXT: vmovaps %ymm0, 384(%rax)
5507 ; AVX2-FP-NEXT: addq $616, %rsp # imm = 0x268
5508 ; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i16_stride7_vf32:
5512 ; AVX2-FCP: # %bb.0:
5513 ; AVX2-FCP-NEXT: subq $312, %rsp # imm = 0x138
5514 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5515 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm9
5516 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm11
5517 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm8
5518 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm7
5519 ; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm4
5520 ; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm10
5521 ; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm3
5522 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
5523 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[3,3,3,3,7,7,7,7]
5524 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
5525 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
5526 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
5527 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7]
5528 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
5529 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
5530 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
5531 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5532 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7]
5533 ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm1
5534 ; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm6
5535 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5536 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7]
5537 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5538 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31]
5539 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
5540 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
5541 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
5542 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
5543 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
5544 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5545 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5546 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4,5,2,2,6,6,6,6]
5547 ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm0, %ymm0
5548 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u]
5549 ; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5550 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[1,2,2,3,5,6,6,7]
5551 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
5552 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
5553 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535]
5554 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
5555 ; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm4
5556 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5557 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
5558 ; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm5
5559 ; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5560 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6]
5561 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
5562 ; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm6
5563 ; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5564 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
5565 ; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5566 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,2,2,2,6,6,6,6]
5567 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
5568 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
5569 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
5570 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
5571 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
5572 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
5573 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
5574 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5575 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm15
5576 ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm12
5577 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
5578 ; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1]
5579 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm1
5580 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5]
5581 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
5582 ; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm14
5583 ; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm11
5584 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm11[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
5585 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,5,7]
5586 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7,8,9],ymm2[10],ymm7[11,12],ymm2[13],ymm7[14,15]
5587 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
5588 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2]
5589 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
5590 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm7
5591 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm1
5592 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[1,1,1,1,5,5,5,5]
5593 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
5594 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
5595 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,1,1,3,4,5,5,7]
5596 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm2[2],ymm8[3,4],ymm2[5],ymm8[6,7,8,9],ymm2[10],ymm8[11,12],ymm2[13],ymm8[14,15]
5597 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
5598 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2]
5599 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm8
5600 ; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm9
5601 ; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm6
5602 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
5603 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm1
5604 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm9[0,0,2,1,4,4,6,5]
5605 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm13[3],ymm1[4,5],ymm13[6],ymm1[7,8,9,10],ymm13[11],ymm1[12,13],ymm13[14],ymm1[15]
5606 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm1[2,2,3,3]
5607 ; AVX2-FCP-NEXT: vmovdqa 32(%rax), %ymm5
5608 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7]
5609 ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm0
5610 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0]
5611 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm13, %ymm0, %ymm0
5612 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm3
5613 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5614 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[0,0,2,1,4,4,6,5]
5615 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7,8,9,10],ymm13[11],ymm3[12,13],ymm13[14],ymm3[15]
5616 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
5617 ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm1, %ymm1
5618 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
5619 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
5620 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
5621 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm0
5622 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5623 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm8, %ymm1, %ymm0
5624 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5625 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4]
5626 ; AVX2-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm1
5627 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31]
5628 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm3
5629 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535]
5630 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm7
5631 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
5632 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5633 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
5634 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0
5635 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,0,0,0,4,0,0]
5636 ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm1, %ymm2
5637 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31]
5638 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm8
5639 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0]
5640 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm2, %ymm8, %ymm2
5641 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
5642 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5643 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm3
5644 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1
5645 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535]
5646 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm7, %ymm2, %ymm2
5647 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
5648 ; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,3,0,4]
5649 ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm3
5650 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31]
5651 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm8
5652 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0]
5653 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm3, %ymm8, %ymm3
5654 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1
5655 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
5656 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm7
5657 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm7, %ymm1
5658 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,0,0,3,0,0,0,4]
5659 ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm8
5660 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0]
5661 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm3, %ymm8, %ymm3
5662 ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm7, %ymm7
5663 ; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm1, %ymm7, %ymm1
5664 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
5665 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2
5666 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5667 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
5668 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5669 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
5670 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[2,2,2,2,6,6,6,6]
5671 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
5672 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
5673 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[2,2,2,2,6,6,6,6]
5674 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
5675 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
5676 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
5677 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
5678 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5679 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,6,2,3,6,7,5,6]
5680 ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm1
5681 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
5682 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5683 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
5684 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
5685 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
5686 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5687 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6]
5688 ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1
5689 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
5690 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5691 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5692 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
5693 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7]
5694 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
5695 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
5696 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
5697 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7]
5698 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
5699 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
5700 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
5701 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5702 ; AVX2-FCP-NEXT: vpbroadcastd 60(%r8), %ymm1
5703 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
5704 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5705 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
5706 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
5707 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
5708 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5709 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7]
5710 ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1
5711 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
5712 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
5713 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5714 ; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm7
5715 ; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm3
5716 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5717 ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm1
5718 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5719 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm0
5720 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5721 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
5722 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
5723 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
5724 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
5725 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
5726 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
5727 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
5728 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
5729 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
5730 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm12
5731 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm13
5732 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm9
5733 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3]
5734 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm1
5735 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm15
5736 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm8
5737 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3]
5738 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
5739 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
5740 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
5741 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm10
5742 ; AVX2-FCP-NEXT: vmovdqa 32(%r9), %xmm0
5743 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5744 ; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm1
5745 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5746 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
5747 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
5748 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm1
5749 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1]
5750 ; AVX2-FCP-NEXT: vpbroadcastd 32(%rax), %ymm2
5751 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0]
5752 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0
5753 ; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm1
5754 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5755 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm14
5756 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm1[0],xmm14[1],xmm1[1],xmm14[2],xmm1[2],xmm14[3],xmm1[3]
5757 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm4
5758 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1]
5759 ; AVX2-FCP-NEXT: vpbroadcastd (%rax), %ymm11
5760 ; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm4, %ymm11, %ymm3
5761 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
5762 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm12, %ymm0, %ymm0
5763 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5764 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm10, %ymm3, %ymm0
5765 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5766 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
5767 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm3
5768 ; AVX2-FCP-NEXT: vmovdqa %xmm7, %xmm10
5769 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
5770 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,2]
5771 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
5772 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
5773 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5774 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm11
5775 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5776 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[1,1,2,3]
5777 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7]
5778 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
5779 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
5780 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
5781 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm3, %ymm11, %ymm3
5782 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0
5783 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[1,1,2,2]
5784 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3],xmm0[4],xmm11[5,6],xmm0[7]
5785 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm15, %xmm4
5786 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[1,1,2,3]
5787 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0,1],xmm4[2],xmm11[3,4],xmm4[5],xmm11[6,7]
5788 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
5789 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
5790 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm0, %ymm4, %ymm0
5791 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
5792 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm5
5793 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3]
5794 ; AVX2-FCP-NEXT: vpbroadcastd 36(%rax), %ymm11
5795 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
5796 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm5, %ymm11, %ymm5
5797 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm4
5798 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3]
5799 ; AVX2-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6
5800 ; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm4, %ymm6, %ymm4
5801 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
5802 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm6
5803 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm5
5804 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
5805 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
5806 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
5807 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
5808 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
5809 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7]
5810 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
5811 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0]
5812 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0
5813 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
5814 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3
5815 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7]
5816 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3]
5817 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7]
5818 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
5819 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3
5820 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5821 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
5822 ; AVX2-FCP-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
5823 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
5824 ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm4
5825 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
5826 ; AVX2-FCP-NEXT: vpbroadcastd 40(%rax), %ymm7
5827 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
5828 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4
5829 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload
5830 ; AVX2-FCP-NEXT: # xmm1 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
5831 ; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1
5832 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
5833 ; AVX2-FCP-NEXT: vpbroadcastd 8(%rax), %ymm2
5834 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm1
5835 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
5836 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
5837 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
5838 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5839 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5840 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rax)
5841 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5842 ; AVX2-FCP-NEXT: vmovaps %ymm2, 320(%rax)
5843 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5844 ; AVX2-FCP-NEXT: vmovaps %ymm2, 128(%rax)
5845 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5846 ; AVX2-FCP-NEXT: vmovaps %ymm2, 352(%rax)
5847 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5848 ; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%rax)
5849 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5850 ; AVX2-FCP-NEXT: vmovaps %ymm2, 192(%rax)
5851 ; AVX2-FCP-NEXT: vmovdqa %ymm1, 64(%rax)
5852 ; AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%rax)
5853 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5854 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rax)
5855 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5856 ; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rax)
5857 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 288(%rax)
5858 ; AVX2-FCP-NEXT: vmovdqa %ymm6, 256(%rax)
5859 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5860 ; AVX2-FCP-NEXT: vmovaps %ymm0, 416(%rax)
5861 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5862 ; AVX2-FCP-NEXT: vmovaps %ymm0, 384(%rax)
5863 ; AVX2-FCP-NEXT: addq $312, %rsp # imm = 0x138
5864 ; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i16_stride7_vf32:
; AVX512: # %bb.0:
5869 ; AVX512-NEXT: subq $680, %rsp # imm = 0x2A8
5870 ; AVX512-NEXT: vmovdqa (%rcx), %ymm1
5871 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
5872 ; AVX512-NEXT: vpshufb %ymm13, %ymm1, %ymm0
5873 ; AVX512-NEXT: vmovdqa64 %ymm1, %ymm27
5874 ; AVX512-NEXT: vmovdqa (%rdx), %ymm8
5875 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
5876 ; AVX512-NEXT: vpshufb %ymm14, %ymm8, %ymm1
5877 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
5878 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5879 ; AVX512-NEXT: vmovdqa (%rsi), %ymm9
5880 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
5881 ; AVX512-NEXT: vpshufb %ymm12, %ymm9, %ymm0
5882 ; AVX512-NEXT: vmovdqa (%rdi), %ymm11
5883 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
5884 ; AVX512-NEXT: vpshufb %ymm15, %ymm11, %ymm1
5885 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
5886 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5887 ; AVX512-NEXT: vmovdqa (%r9), %ymm1
5888 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
5889 ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2
5890 ; AVX512-NEXT: vmovdqa64 %ymm1, %ymm16
5891 ; AVX512-NEXT: vmovdqa (%r8), %ymm4
5892 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
5893 ; AVX512-NEXT: vpshufb %ymm1, %ymm4, %ymm3
5894 ; AVX512-NEXT: vmovdqa64 %ymm4, %ymm17
5895 ; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
5896 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5897 ; AVX512-NEXT: vmovdqa 32(%r9), %xmm2
5898 ; AVX512-NEXT: vmovdqa 32(%r8), %xmm10
5899 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3]
5900 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5901 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
5902 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6]
5903 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
5904 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
5905 ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
5906 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5907 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2
5908 ; AVX512-NEXT: vmovdqa 32(%rsi), %ymm10
5909 ; AVX512-NEXT: vpshufb %ymm12, %ymm10, %ymm3
5910 ; AVX512-NEXT: vpshufb %ymm15, %ymm2, %ymm4
5911 ; AVX512-NEXT: vpor %ymm3, %ymm4, %ymm3
5912 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5913 ; AVX512-NEXT: vmovdqa 32(%rcx), %ymm12
5914 ; AVX512-NEXT: vmovdqa 32(%rdx), %ymm15
5915 ; AVX512-NEXT: vpshufb %ymm13, %ymm12, %ymm3
5916 ; AVX512-NEXT: vpshufb %ymm14, %ymm15, %ymm4
5917 ; AVX512-NEXT: vpor %ymm3, %ymm4, %ymm3
5918 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5919 ; AVX512-NEXT: vmovdqa 32(%r9), %ymm13
5920 ; AVX512-NEXT: vmovdqa 32(%r8), %ymm14
5921 ; AVX512-NEXT: vpshufb %ymm0, %ymm13, %ymm0
5922 ; AVX512-NEXT: vpshufb %ymm1, %ymm14, %ymm1
5923 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
5924 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5925 ; AVX512-NEXT: vmovdqa (%rcx), %xmm0
5926 ; AVX512-NEXT: vmovdqa (%rdx), %xmm3
5927 ; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
5928 ; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm4
5929 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2]
5930 ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
5931 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
5932 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7]
5933 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3]
5934 ; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm26
5935 ; AVX512-NEXT: vmovdqa (%r9), %xmm4
5936 ; AVX512-NEXT: vmovdqa (%r8), %xmm5
5937 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
5938 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,7,6]
5939 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,2,4,5,6,7]
5940 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
5941 ; AVX512-NEXT: vpermi2d %zmm7, %zmm6, %zmm25
5942 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
5943 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5944 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
5945 ; AVX512-NEXT: vpbroadcastd 8(%rax), %ymm0
5946 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
5947 ; AVX512-NEXT: vmovdqa (%rax), %ymm3
5948 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5949 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
5950 ; AVX512-NEXT: vpshufb %ymm7, %ymm3, %ymm6
5951 ; AVX512-NEXT: vmovdqa64 %ymm7, %ymm21
5952 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0
5953 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5954 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
5955 ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
5956 ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[2,2,2,2,6,6,6,6]
5957 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
5958 ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm28
5959 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
5960 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
5961 ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,3,3,3,7,7,7,7]
5962 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
5963 ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm29
5964 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
5965 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
5966 ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6]
5967 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
5968 ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm30
5969 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
5970 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
5971 ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[3,3,3,3,7,7,7,7]
5972 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
5973 ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm31
5974 ; AVX512-NEXT: vprold $16, %ymm13, %ymm4
5975 ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[1,2,2,3,5,6,6,7]
5976 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
5977 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5978 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
5979 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
5980 ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[3,3,3,3,7,7,7,7]
5981 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
5982 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5983 ; AVX512-NEXT: vmovdqa64 %ymm27, %ymm3
5984 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
5985 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
5986 ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[3,3,3,3,7,7,7,7]
5987 ; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
5988 ; AVX512-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill
5989 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
5990 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
5991 ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[3,3,3,3,7,7,7,7]
5992 ; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
5993 ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5994 ; AVX512-NEXT: vmovdqa64 %ymm16, %ymm4
5995 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
5996 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
5997 ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[3,3,3,3,7,7,7,7]
5998 ; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
5999 ; AVX512-NEXT: vmovdqa64 %ymm4, %ymm19
6000 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm4
6001 ; AVX512-NEXT: vmovdqa 32(%rsi), %xmm5
6002 ; AVX512-NEXT: vprold $16, %xmm5, %xmm6
6003 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,3]
6004 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7]
6005 ; AVX512-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6006 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
6007 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
6008 ; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6009 ; AVX512-NEXT: vmovdqa 32(%rcx), %xmm4
6010 ; AVX512-NEXT: vpshufb %xmm1, %xmm4, %xmm1
6011 ; AVX512-NEXT: vmovdqa 32(%rdx), %xmm5
6012 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2]
6013 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3],xmm1[4],xmm7[5,6],xmm1[7]
6014 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6015 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
6016 ; AVX512-NEXT: vmovdqa64 %xmm1, %xmm18
6017 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
6018 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6019 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5]
6020 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
6021 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
6022 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
6023 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6024 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[0,1,1,3,4,5,5,7]
6025 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
6026 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
6027 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
6028 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6029 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,0,2,1,4,4,6,5]
6030 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
6031 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
6032 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
6033 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6034 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
6035 ; AVX512-NEXT: vmovdqa 32(%rax), %ymm2
6036 ; AVX512-NEXT: vpermd %zmm2, %zmm1, %zmm27
6037 ; AVX512-NEXT: vmovdqa64 %ymm21, %ymm1
6038 ; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1
6039 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7]
6040 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
6041 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
6042 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm20
6043 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
6044 ; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm5
6045 ; AVX512-NEXT: vpshufb %xmm4, %xmm6, %xmm14
6046 ; AVX512-NEXT: vmovdqa (%rdi), %xmm1
6047 ; AVX512-NEXT: vmovdqa (%rsi), %xmm2
6048 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
6049 ; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm0
6050 ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm21
6051 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
6052 ; AVX512-NEXT: vprold $16, %xmm2, %xmm2
6053 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
6054 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
6055 ; AVX512-NEXT: vmovdqa64 %ymm1, %ymm24
6056 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
6057 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
6058 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,5,7]
6059 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
6060 ; AVX512-NEXT: vmovdqa64 %ymm1, %ymm23
6061 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
6062 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6]
6063 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
6064 ; AVX512-NEXT: vmovdqa64 %ymm1, %ymm22
6065 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
6066 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
6067 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5]
6068 ; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
6069 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[2,2,2,2,6,6,6,6]
6070 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
6071 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
6072 ; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15]
6073 ; AVX512-NEXT: vmovdqa64 %ymm16, %ymm4
6074 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
6075 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
6076 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[0,0,2,1,4,4,6,5]
6077 ; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
6078 ; AVX512-NEXT: vprold $16, %ymm16, %ymm1
6079 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm17[1,2,2,3,5,6,6,7]
6080 ; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0,1],ymm1[2],ymm10[3,4],ymm1[5],ymm10[6,7,8,9],ymm1[10],ymm10[11,12],ymm1[13],ymm10[14,15]
6081 ; AVX512-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
6082 ; AVX512-NEXT: # xmm10 = mem[0,2,3,3,4,5,6,7]
6083 ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
6084 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
6085 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4]
6086 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,1,3]
6087 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,0,1,1]
6088 ; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,2,3]
6089 ; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm29[0,2,2,3]
6090 ; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm30[0,2,2,3]
6091 ; AVX512-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,1,3,3]
6092 ; AVX512-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
6093 ; AVX512-NEXT: # ymm6 = mem[2,1,3,2]
6094 ; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
6095 ; AVX512-NEXT: # ymm5 = mem[2,2,2,3]
6096 ; AVX512-NEXT: vpermq $232, (%rsp), %ymm1 # 32-byte Folded Reload
6097 ; AVX512-NEXT: # ymm1 = mem[0,2,2,3]
6098 ; AVX512-NEXT: vmovdqa64 %xmm18, %xmm0
6099 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,1,3,2,4,5,6,7]
6100 ; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1]
6101 ; AVX512-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
6102 ; AVX512-NEXT: # ymm0 = mem[2,1,3,3]
6103 ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm14[0,0,1,1]
6104 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm19[2,2,2,3]
6105 ; AVX512-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
6106 ; AVX512-NEXT: # xmm15 = mem[0,1,3,2,4,5,6,7]
6107 ; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,1,3]
6108 ; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm28
6109 ; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm29
6110 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
6111 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm30 & (zmm29 ^ zmm28))
6112 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3]
6113 ; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm1
6114 ; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9
6115 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm1 ^ (zmm30 & (zmm9 ^ zmm1))
6116 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 32-byte Folded Reload
6117 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 32-byte Folded Reload
6118 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
6119 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm2 & (zmm1 ^ zmm0))
6120 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 32-byte Folded Reload
6121 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
6122 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & mem) | zmm3
6123 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
6124 ; AVX512-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
6125 ; AVX512-NEXT: # ymm1 = mem[2,3,3,3,6,7,7,7]
6126 ; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
6127 ; AVX512-NEXT: # ymm3 = mem[0,0,2,1]
6128 ; AVX512-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
6129 ; AVX512-NEXT: # xmm10 = mem[2,1,2,3,4,5,6,7]
6130 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4]
6131 ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,3]
6132 ; AVX512-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
6133 ; AVX512-NEXT: # ymm12 = mem[0,0,1,1]
6134 ; AVX512-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
6135 ; AVX512-NEXT: # xmm14 = mem[0,2,3,3,4,5,6,7]
6136 ; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
6137 ; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
6138 ; AVX512-NEXT: # ymm17 = mem[2,2,2,3]
6139 ; AVX512-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
6140 ; AVX512-NEXT: # ymm18 = mem[2,1,3,2]
6141 ; AVX512-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
6142 ; AVX512-NEXT: # ymm19 = mem[2,2,3,3]
6143 ; AVX512-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,1,1]
6144 ; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm24[0,0,2,1]
6145 ; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm23[2,1,3,2]
6146 ; AVX512-NEXT: vpermq {{.*#+}} ymm31 = ymm22[2,2,2,3]
6147 ; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
6148 ; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
6149 ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
6150 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
6151 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5
6152 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm29))
6153 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm5))
6154 ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm15[0,0,0,1]
6155 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
6156 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
6157 ; AVX512-NEXT: vpbroadcastd 32(%rax), %ymm5
6158 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
6159 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4))
6160 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm9))
6161 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm3
6162 ; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm4
6163 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm2 & (zmm4 ^ zmm3))
6164 ; AVX512-NEXT: vpbroadcastd 36(%rax), %ymm2
6165 ; AVX512-NEXT: vpbroadcastd 40(%rax), %ymm3
6166 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
6167 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
6168 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm3))
6169 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm4))
6170 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
6171 ; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm3, %zmm3
6172 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
6173 ; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm4, %zmm4
6174 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
6175 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
6176 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[0,1,2,3]
6177 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 | (zmm3 & mem)
6178 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm4))
6179 ; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm21, %zmm3
6180 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm26 ^ (mem & (zmm3 ^ zmm26))
6181 ; AVX512-NEXT: vpbroadcastd (%rax), %ymm4
6182 ; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm5
6183 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
6184 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm25))
6185 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
6186 ; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm3
6187 ; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm5
6188 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3))
6189 ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm3
6190 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
6191 ; AVX512-NEXT: vpermd (%rax), %zmm6, %zmm6
6192 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm3))
6193 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
6194 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
6195 ; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rax)
6196 ; AVX512-NEXT: vmovdqa64 %zmm4, (%rax)
6197 ; AVX512-NEXT: vmovdqa64 %zmm20, 320(%rax)
6198 ; AVX512-NEXT: vmovdqa64 %zmm2, 256(%rax)
6199 ; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rax)
6200 ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax)
6201 ; AVX512-NEXT: vmovdqa64 %zmm27, 384(%rax)
6202 ; AVX512-NEXT: addq $680, %rsp # imm = 0x2A8
6203 ; AVX512-NEXT: vzeroupper
6204 ; AVX512-NEXT: retq
6205 ;
6206 ; AVX512-FCP-LABEL: store_i16_stride7_vf32:
6207 ; AVX512-FCP: # %bb.0:
6208 ; AVX512-FCP-NEXT: subq $264, %rsp # imm = 0x108
6209 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm10
6210 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
6211 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm1
6212 ; AVX512-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6213 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm4
6214 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
6215 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm3
6216 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm18
6217 ; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6218 ; AVX512-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1
6219 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6220 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm9
6221 ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm13
6222 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
6223 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm3
6224 ; AVX512-FCP-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill
6225 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6
6226 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
6227 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm4
6228 ; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm17
6229 ; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6230 ; AVX512-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3
6231 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6232 ; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm15
6233 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
6234 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm4
6235 ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm11
6236 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
6237 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm11, %ymm8
6238 ; AVX512-FCP-NEXT: vpor %ymm4, %ymm8, %ymm3
6239 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6240 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm1
6241 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm12
6242 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm5
6243 ; AVX512-FCP-NEXT: vpor %ymm1, %ymm5, %ymm1
6244 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6245 ; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm8
6246 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0
6247 ; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm14
6248 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm14, %ymm1
6249 ; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
6250 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6251 ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm1
6252 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm0
6253 ; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm20
6254 ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm2
6255 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1
6256 ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm16
6257 ; AVX512-FCP-NEXT: vporq %ymm0, %ymm1, %ymm22
6258 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
6259 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm1
6260 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
6261 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
6262 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
6263 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[2,2,2,2,6,6,6,6]
6264 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15]
6265 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,10,0,11,0]
6266 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm24
6267 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
6268 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[3,3,3,3,7,7,7,7]
6269 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
6270 ; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm3
6271 ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6272 ; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm2
6273 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6274 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
6275 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
6276 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
6277 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm27
6278 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,2,2,3,8,0,9,0]
6279 ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm25
6280 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0
6281 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[3,3,3,3,7,7,7,7]
6282 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
6283 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
6284 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
6285 ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm10
6286 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3]
6287 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
6288 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,1,3,3,8,8,9,9]
6289 ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm21
6290 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
6291 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7]
6292 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
6293 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [2,2,2,3,8,8,8,9]
6294 ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm2
6295 ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm7
6296 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
6297 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
6298 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm6
6299 ; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm23
6300 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
6301 ; AVX512-FCP-NEXT: vprold $16, %xmm10, %xmm6
6302 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
6303 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2],xmm1[3,4],xmm6[5],xmm1[6,7]
6304 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
6305 ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm0
6306 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,8,9,0]
6307 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm26
6308 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
6309 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
6310 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
6311 ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm1
6312 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,1,3,8,8,9,9]
6313 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm29
6314 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm2
6315 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm0
6316 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
6317 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
6318 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7]
6319 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
6320 ; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm4
6321 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3
6322 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,1,0,8,8,9,9]
6323 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm3, %zmm27
6324 ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm6
6325 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm9
6326 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
6327 ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm7
6328 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3
6329 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,1,8,9,9,11]
6330 ; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm3, %zmm28
6331 ; AVX512-FCP-NEXT: vprold $16, %ymm15, %ymm3
6332 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[1,2,2,3,5,6,6,7]
6333 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15]
6334 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
6335 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm5
6336 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[0,0,2,1,4,4,6,5]
6337 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15]
6338 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [2,2,3,3,10,9,11,10]
6339 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm31
6340 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
6341 ; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm19
6342 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm5
6343 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm3
6344 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
6345 ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm0
6346 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm18
6347 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6348 ; AVX512-FCP-NEXT: vpbroadcastd 8(%rax), %ymm0
6349 ; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
6350 ; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm10
6351 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
6352 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm2
6353 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm30
6354 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
6355 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
6356 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[1,1,1,1,5,5,5,5]
6357 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7,8,9],ymm6[10],ymm0[11,12],ymm6[13],ymm0[14,15]
6358 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm17
6359 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
6360 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm14[0,1,1,3,4,5,5,7]
6361 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7,8,9],ymm0[10],ymm6[11,12],ymm0[13],ymm6[14,15]
6362 ; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm13
6363 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm13, %ymm0
6364 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm16[0,0,2,1,4,4,6,5]
6365 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
6366 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,0,3,10,10,11,11]
6367 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm22, %zmm9
6368 ; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm2
6369 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
6370 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,4,5,4,5,5,7,12,13,10,10,14,14,14,14]
6371 ; AVX512-FCP-NEXT: vpermd %ymm2, %ymm22, %ymm4
6372 ; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
6373 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
6374 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
6375 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[3,3,3,3,7,7,7,7]
6376 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0],ymm4[1],ymm12[2,3],ymm4[4],ymm12[5,6,7,8],ymm4[9],ymm12[10,11],ymm4[12],ymm12[13,14,15]
6377 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
6378 ; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1]
6379 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm12
6380 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm14[2,2,2,2,6,6,6,6]
6381 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm8[2],ymm12[3,4],ymm8[5],ymm12[6,7,8,9],ymm8[10],ymm12[11,12],ymm8[13],ymm12[14,15]
6382 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11]
6383 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm20, %zmm12
6384 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
6385 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[3,3,3,3,7,7,7,7]
6386 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7,8,9],ymm8[10],ymm4[11,12],ymm8[13],ymm4[14,15]
6387 ; AVX512-FCP-NEXT: vprold $16, %ymm13, %ymm8
6388 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm16[1,2,2,3,5,6,6,7]
6389 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm8[2],ymm15[3,4],ymm8[5],ymm15[6,7,8,9],ymm8[10],ymm15[11,12],ymm8[13],ymm15[14,15]
6390 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6391 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
6392 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
6393 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
6394 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,1,2,2]
6395 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm14[0],xmm8[1],xmm14[2,3],xmm8[4],xmm14[5,6],xmm8[7]
6396 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
6397 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,1,1,8,8,10,9]
6398 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm8
6399 ; AVX512-FCP-NEXT: vprold $16, %xmm3, %xmm0
6400 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[1,1,2,3]
6401 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm0[2],xmm13[3,4],xmm0[5],xmm13[6,7]
6402 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
6403 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
6404 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm0
6405 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3
6406 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm14, %zmm3
6407 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,1,3,2,10,10,10,11]
6408 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm15
6409 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
6410 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm4
6411 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
6412 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[2,2,2,2,6,6,6,6]
6413 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15]
6414 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
6415 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm13[0,1,1,3,4,5,5,7]
6416 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
6417 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm6
6418 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
6419 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm24 ^ (zmm16 & (zmm12 ^ zmm24))
6420 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm12))
6421 ; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm5
6422 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,5,2,2,6,6,6,6,30,31,27,27,31,31,30,31]
6423 ; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm7
6424 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm15))
6425 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
6426 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm26 ^ (zmm2 & (zmm8 ^ zmm26))
6427 ; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm12
6428 ; AVX512-FCP-NEXT: vpbroadcastd 40(%rax), %ymm13
6429 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12
6430 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm29))
6431 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm8))
6432 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm3))
6433 ; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload
6434 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
6435 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload
6436 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm24[2,2,2,2,6,6,6,6]
6437 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3],ymm3[4],ymm8[5,6,7,8],ymm3[9],ymm8[10,11],ymm3[12],ymm8[13,14,15]
6438 ; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm8
6439 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
6440 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
6441 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm18[0,0,1,3]
6442 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
6443 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm17[2,2,2,3]
6444 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
6445 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
6446 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm24[1,1,1,1,5,5,5,5]
6447 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1],ymm4[2],ymm15[3,4],ymm4[5],ymm15[6,7,8,9],ymm4[10],ymm15[11,12],ymm4[13],ymm15[14,15]
6448 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm4
6449 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm6))
6450 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,0,0,0,7,0,0,7]
6451 ; AVX512-FCP-NEXT: vpermd %ymm10, %ymm3, %ymm3
6452 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5
6453 ; AVX512-FCP-NEXT: vpermd %zmm5, %zmm22, %zmm5
6454 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm31))
6455 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm4))
6456 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 32-byte Folded Reload
6457 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm6 # 32-byte Folded Reload
6458 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm4 ^ (zmm2 & (zmm6 ^ zmm4))
6459 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm25 ^ (zmm16 & (zmm21 ^ zmm25))
6460 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
6461 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & mem) | zmm30
6462 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm6))
6463 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6464 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm2, %zmm2
6465 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
6466 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4
6467 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2))
6468 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm9 & mem)
6469 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4))
6470 ; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm2
6471 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
6472 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm23))
6473 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm21))
6474 ; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm3
6475 ; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm4
6476 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
6477 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm28))
6478 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm27))
6479 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6480 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 128(%rax)
6481 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
6482 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax)
6483 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 256(%rax)
6484 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax)
6485 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 384(%rax)
6486 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
6487 ; AVX512-FCP-NEXT: addq $264, %rsp # imm = 0x108
6488 ; AVX512-FCP-NEXT: vzeroupper
6489 ; AVX512-FCP-NEXT: retq
6490 ;
6491 ; AVX512DQ-LABEL: store_i16_stride7_vf32:
6492 ; AVX512DQ: # %bb.0:
6493 ; AVX512DQ-NEXT: subq $680, %rsp # imm = 0x2A8
6494 ; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm1
6495 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
6496 ; AVX512DQ-NEXT: vpshufb %ymm13, %ymm1, %ymm0
6497 ; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm27
6498 ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm8
6499 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
6500 ; AVX512DQ-NEXT: vpshufb %ymm14, %ymm8, %ymm1
6501 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
6502 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6503 ; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm9
6504 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
6505 ; AVX512DQ-NEXT: vpshufb %ymm12, %ymm9, %ymm0
6506 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm11
6507 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
6508 ; AVX512DQ-NEXT: vpshufb %ymm15, %ymm11, %ymm1
6509 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
6510 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6511 ; AVX512DQ-NEXT: vmovdqa (%r9), %ymm1
6512 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
6513 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2
6514 ; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm16
6515 ; AVX512DQ-NEXT: vmovdqa (%r8), %ymm4
6516 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
6517 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm3
6518 ; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm17
6519 ; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
6520 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6521 ; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm2
6522 ; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm10
6523 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3]
6524 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6525 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
6526 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6]
6527 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
6528 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
6529 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
6530 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6531 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2
6532 ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm10
6533 ; AVX512DQ-NEXT: vpshufb %ymm12, %ymm10, %ymm3
6534 ; AVX512DQ-NEXT: vpshufb %ymm15, %ymm2, %ymm4
6535 ; AVX512DQ-NEXT: vpor %ymm3, %ymm4, %ymm3
6536 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6537 ; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm12
6538 ; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm15
6539 ; AVX512DQ-NEXT: vpshufb %ymm13, %ymm12, %ymm3
6540 ; AVX512DQ-NEXT: vpshufb %ymm14, %ymm15, %ymm4
6541 ; AVX512DQ-NEXT: vpor %ymm3, %ymm4, %ymm3
6542 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6543 ; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm13
6544 ; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm14
6545 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm13, %ymm0
6546 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm14, %ymm1
6547 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
6548 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6549 ; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm0
6550 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm3
6551 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
6552 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm4
6553 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2]
6554 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
6555 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
6556 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7]
6557 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3]
6558 ; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm26
6559 ; AVX512DQ-NEXT: vmovdqa (%r9), %xmm4
6560 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm5
6561 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
6562 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,7,6]
6563 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,2,4,5,6,7]
6564 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
6565 ; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm6, %zmm25
6566 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
6567 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6568 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
6569 ; AVX512DQ-NEXT: vpbroadcastd 8(%rax), %ymm0
6570 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
6571 ; AVX512DQ-NEXT: vmovdqa (%rax), %ymm3
6572 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6573 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
6574 ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm3, %ymm6
6575 ; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm21
6576 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0
6577 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6578 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
6579 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
6580 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[2,2,2,2,6,6,6,6]
6581 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
6582 ; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm28
6583 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
6584 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
6585 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,3,3,3,7,7,7,7]
6586 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
6587 ; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm29
6588 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
6589 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
6590 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6]
6591 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
6592 ; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm30
6593 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
6594 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
6595 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[3,3,3,3,7,7,7,7]
6596 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
6597 ; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm31
6598 ; AVX512DQ-NEXT: vprold $16, %ymm13, %ymm4
6599 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[1,2,2,3,5,6,6,7]
6600 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
6601 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6602 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
6603 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
6604 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[3,3,3,3,7,7,7,7]
6605 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
6606 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6607 ; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm3
6608 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
6609 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
6610 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[3,3,3,3,7,7,7,7]
6611 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
6612 ; AVX512DQ-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill
6613 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
6614 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
6615 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[3,3,3,3,7,7,7,7]
6616 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
6617 ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6618 ; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm4
6619 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
6620 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
6621 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[3,3,3,3,7,7,7,7]
6622 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
6623 ; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm19
6624 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm4
6625 ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm5
6626 ; AVX512DQ-NEXT: vprold $16, %xmm5, %xmm6
6627 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,3]
6628 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7]
6629 ; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6630 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
6631 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
6632 ; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6633 ; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm4
6634 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm4, %xmm1
6635 ; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm5
6636 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2]
6637 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3],xmm1[4],xmm7[5,6],xmm1[7]
6638 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6639 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
6640 ; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm18
6641 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
6642 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6643 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5]
6644 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
6645 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
6646 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
6647 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6648 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[0,1,1,3,4,5,5,7]
6649 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
6650 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
6651 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
6652 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6653 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,0,2,1,4,4,6,5]
6654 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
6655 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
6656 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
6657 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6658 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
6659 ; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm2
6660 ; AVX512DQ-NEXT: vpermd %zmm2, %zmm1, %zmm27
6661 ; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm1
6662 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1
6663 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7]
6664 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
6665 ; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
6666 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm20
6667 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
6668 ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm5
6669 ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm6, %xmm14
6670 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
6671 ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm2
6672 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
6673 ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm0
6674 ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm21
6675 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
6676 ; AVX512DQ-NEXT: vprold $16, %xmm2, %xmm2
6677 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
6678 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
6679 ; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm24
6680 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
6681 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
6682 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,5,7]
6683 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
6684 ; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm23
6685 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
6686 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6]
6687 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
6688 ; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm22
6689 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
6690 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
6691 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5]
6692 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
6693 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[2,2,2,2,6,6,6,6]
6694 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
6695 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
6696 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15]
6697 ; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm4
6698 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
6699 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
6700 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[0,0,2,1,4,4,6,5]
6701 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
6702 ; AVX512DQ-NEXT: vprold $16, %ymm16, %ymm1
6703 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm17[1,2,2,3,5,6,6,7]
6704 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0,1],ymm1[2],ymm10[3,4],ymm1[5],ymm10[6,7,8,9],ymm1[10],ymm10[11,12],ymm1[13],ymm10[14,15]
6705 ; AVX512DQ-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
6706 ; AVX512DQ-NEXT: # xmm10 = mem[0,2,3,3,4,5,6,7]
6707 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
6708 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
6709 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4]
6710 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,1,3]
6711 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,0,1,1]
6712 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,2,3]
6713 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm29[0,2,2,3]
6714 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm30[0,2,2,3]
6715 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,1,3,3]
6716 ; AVX512DQ-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
6717 ; AVX512DQ-NEXT: # ymm6 = mem[2,1,3,2]
6718 ; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
6719 ; AVX512DQ-NEXT: # ymm5 = mem[2,2,2,3]
6720 ; AVX512DQ-NEXT: vpermq $232, (%rsp), %ymm1 # 32-byte Folded Reload
6721 ; AVX512DQ-NEXT: # ymm1 = mem[0,2,2,3]
6722 ; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm0
6723 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,1,3,2,4,5,6,7]
6724 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1]
6725 ; AVX512DQ-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
6726 ; AVX512DQ-NEXT: # ymm0 = mem[2,1,3,3]
6727 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm14[0,0,1,1]
6728 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm19[2,2,2,3]
6729 ; AVX512DQ-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
6730 ; AVX512DQ-NEXT: # xmm15 = mem[0,1,3,2,4,5,6,7]
6731 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,1,3]
6732 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm28
6733 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm29
6734 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
6735 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm30 & (zmm29 ^ zmm28))
6736 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3]
6737 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm1
6738 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9
6739 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm1 ^ (zmm30 & (zmm9 ^ zmm1))
6740 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 32-byte Folded Reload
6741 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 32-byte Folded Reload
6742 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
6743 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm2 & (zmm1 ^ zmm0))
6744 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 32-byte Folded Reload
6745 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
6746 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & mem) | zmm3
6747 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
6748 ; AVX512DQ-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
6749 ; AVX512DQ-NEXT: # ymm1 = mem[2,3,3,3,6,7,7,7]
6750 ; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
6751 ; AVX512DQ-NEXT: # ymm3 = mem[0,0,2,1]
6752 ; AVX512DQ-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
6753 ; AVX512DQ-NEXT: # xmm10 = mem[2,1,2,3,4,5,6,7]
6754 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4]
6755 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,3]
6756 ; AVX512DQ-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
6757 ; AVX512DQ-NEXT: # ymm12 = mem[0,0,1,1]
6758 ; AVX512DQ-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
6759 ; AVX512DQ-NEXT: # xmm14 = mem[0,2,3,3,4,5,6,7]
6760 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
6761 ; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
6762 ; AVX512DQ-NEXT: # ymm17 = mem[2,2,2,3]
6763 ; AVX512DQ-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
6764 ; AVX512DQ-NEXT: # ymm18 = mem[2,1,3,2]
6765 ; AVX512DQ-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
6766 ; AVX512DQ-NEXT: # ymm19 = mem[2,2,3,3]
6767 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,1,1]
6768 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm24[0,0,2,1]
6769 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm23[2,1,3,2]
6770 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm31 = ymm22[2,2,2,3]
6771 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
6772 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
6773 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
6774 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
6775 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5
6776 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm29))
6777 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm5))
6778 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm15[0,0,0,1]
6779 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
6780 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
6781 ; AVX512DQ-NEXT: vpbroadcastd 32(%rax), %ymm5
6782 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
6783 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4))
6784 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm9))
6785 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm3
6786 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm4
6787 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm2 & (zmm4 ^ zmm3))
6788 ; AVX512DQ-NEXT: vpbroadcastd 36(%rax), %ymm2
6789 ; AVX512DQ-NEXT: vpbroadcastd 40(%rax), %ymm3
6790 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
6791 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
6792 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm3))
6793 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm4))
6794 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
6795 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm3, %zmm3
6796 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
6797 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm4, %zmm4
6798 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
6799 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
6800 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[0,1,2,3]
6801 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 | (zmm3 & mem)
6802 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm4))
6803 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm28, %zmm21, %zmm3
6804 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm26 ^ (mem & (zmm3 ^ zmm26))
6805 ; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm4
6806 ; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm5
6807 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
6808 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm25))
6809 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
6810 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm3
6811 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm5
6812 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3))
6813 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm3
6814 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
6815 ; AVX512DQ-NEXT: vpermd (%rax), %zmm6, %zmm6
6816 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm3))
6817 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
6818 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
6819 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rax)
6820 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax)
6821 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, 320(%rax)
6822 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rax)
6823 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax)
6824 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rax)
6825 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, 384(%rax)
6826 ; AVX512DQ-NEXT: addq $680, %rsp # imm = 0x2A8
6827 ; AVX512DQ-NEXT: vzeroupper
6828 ; AVX512DQ-NEXT: retq
6829 ;
6830 ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf32:
6831 ; AVX512DQ-FCP: # %bb.0:
6832 ; AVX512DQ-FCP-NEXT: subq $264, %rsp # imm = 0x108
6833 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm10
6834 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
6835 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm1
6836 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6837 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm4
6838 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
6839 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm3
6840 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm18
6841 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6842 ; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1
6843 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6844 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm9
6845 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm13
6846 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
6847 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm3
6848 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill
6849 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm6
6850 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
6851 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm4
6852 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm17
6853 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6854 ; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3
6855 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6856 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm15
6857 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
6858 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm4
6859 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm11
6860 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
6861 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm11, %ymm8
6862 ; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm8, %ymm3
6863 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6864 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm1
6865 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm12
6866 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm5
6867 ; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm5, %ymm1
6868 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6869 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm8
6870 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0
6871 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm14
6872 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm14, %ymm1
6873 ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
6874 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6875 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm1
6876 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm0
6877 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm20
6878 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm2
6879 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1
6880 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm16
6881 ; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm1, %ymm22
6882 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
6883 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm1
6884 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
6885 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
6886 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm13[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
6887 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[2,2,2,2,6,6,6,6]
6888 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3],ymm2[4],ymm5[5,6,7,8],ymm2[9],ymm5[10,11],ymm2[12],ymm5[13,14,15]
6889 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,10,0,11,0]
6890 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm24
6891 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
6892 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[3,3,3,3,7,7,7,7]
6893 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
6894 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm3
6895 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6896 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm2
6897 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6898 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
6899 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
6900 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
6901 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm27
6902 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,2,2,3,8,0,9,0]
6903 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm25
6904 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0
6905 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[3,3,3,3,7,7,7,7]
6906 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
6907 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
6908 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
6909 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm10
6910 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3]
6911 ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
6912 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,1,3,3,8,8,9,9]
6913 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm0, %zmm21
6914 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
6915 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[3,3,3,3,7,7,7,7]
6916 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
6917 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [2,2,2,3,8,8,8,9]
6918 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm2
6919 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm7
6920 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
6921 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
6922 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm6
6923 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm23
6924 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
6925 ; AVX512DQ-FCP-NEXT: vprold $16, %xmm10, %xmm6
6926 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
6927 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2],xmm1[3,4],xmm6[5],xmm1[6,7]
6928 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
6929 ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm0
6930 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,8,9,0]
6931 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm26
6932 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
6933 ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
6934 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
6935 ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm1
6936 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,1,3,8,8,9,9]
6937 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm29
6938 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm2
6939 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm0
6940 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
6941 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
6942 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7]
6943 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
6944 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm4
6945 ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3
6946 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,1,0,8,8,9,9]
6947 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm3, %zmm27
6948 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm6
6949 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm9
6950 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
6951 ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm7
6952 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3
6953 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,1,8,9,9,11]
6954 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm3, %zmm28
6955 ; AVX512DQ-FCP-NEXT: vprold $16, %ymm15, %ymm3
6956 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[1,2,2,3,5,6,6,7]
6957 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15]
6958 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
6959 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm5
6960 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[0,0,2,1,4,4,6,5]
6961 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15]
6962 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [2,2,3,3,10,9,11,10]
6963 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm5, %zmm31
6964 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
6965 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm19
6966 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm5
6967 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm3
6968 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
6969 ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm0
6970 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm18
6971 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6972 ; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rax), %ymm0
6973 ; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
6974 ; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm10
6975 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
6976 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm2
6977 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm30
6978 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
6979 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
6980 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm12[1,1,1,1,5,5,5,5]
6981 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7,8,9],ymm6[10],ymm0[11,12],ymm6[13],ymm0[14,15]
6982 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm17
6983 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
6984 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm14[0,1,1,3,4,5,5,7]
6985 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7,8,9],ymm0[10],ymm6[11,12],ymm0[13],ymm6[14,15]
6986 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm13
6987 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm13, %ymm0
6988 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm16[0,0,2,1,4,4,6,5]
6989 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
6990 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,0,3,10,10,11,11]
6991 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm22, %zmm9
6992 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm2
6993 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
6994 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,1,4,5,4,5,5,7,12,13,10,10,14,14,14,14]
6995 ; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm22, %ymm4
6996 ; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
6997 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
6998 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm8[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
6999 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[3,3,3,3,7,7,7,7]
7000 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0],ymm4[1],ymm12[2,3],ymm4[4],ymm12[5,6,7,8],ymm4[9],ymm12[10,11],ymm4[12],ymm12[13,14,15]
7001 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
7002 ; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1]
7003 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm12
7004 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm14[2,2,2,2,6,6,6,6]
7005 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm8[2],ymm12[3,4],ymm8[5],ymm12[6,7,8,9],ymm8[10],ymm12[11,12],ymm8[13],ymm12[14,15]
7006 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11]
7007 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm20, %zmm12
7008 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
7009 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[3,3,3,3,7,7,7,7]
7010 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7,8,9],ymm8[10],ymm4[11,12],ymm8[13],ymm4[14,15]
7011 ; AVX512DQ-FCP-NEXT: vprold $16, %ymm13, %ymm8
7012 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm16[1,2,2,3,5,6,6,7]
7013 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm8[2],ymm15[3,4],ymm8[5],ymm15[6,7,8,9],ymm8[10],ymm15[11,12],ymm8[13],ymm15[14,15]
7014 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7015 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
7016 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
7017 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
7018 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,1,2,2]
7019 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm14[0],xmm8[1],xmm14[2,3],xmm8[4],xmm14[5,6],xmm8[7]
7020 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
7021 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,1,1,8,8,10,9]
7022 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm8
7023 ; AVX512DQ-FCP-NEXT: vprold $16, %xmm3, %xmm0
7024 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[1,1,2,3]
7025 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm0[2],xmm13[3,4],xmm0[5],xmm13[6,7]
7026 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
7027 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
7028 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm0
7029 ; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3
7030 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm14, %zmm3
7031 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,1,3,2,10,10,10,11]
7032 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm15
7033 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
7034 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm4
7035 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
7036 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[2,2,2,2,6,6,6,6]
7037 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15]
7038 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
7039 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm13[0,1,1,3,4,5,5,7]
7040 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
7041 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm6
7042 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
7043 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm24 ^ (zmm16 & (zmm12 ^ zmm24))
7044 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm12))
7045 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm5
7046 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [4,5,2,2,6,6,6,6,30,31,27,27,31,31,30,31]
7047 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm7
7048 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm15))
7049 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
7050 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm26 ^ (zmm2 & (zmm8 ^ zmm26))
7051 ; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm12
7052 ; AVX512DQ-FCP-NEXT: vpbroadcastd 40(%rax), %ymm13
7053 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm12, %zmm12
7054 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm29))
7055 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm8))
7056 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm3))
7057 ; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload
7058 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
7059 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload
7060 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm24[2,2,2,2,6,6,6,6]
7061 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3],ymm3[4],ymm8[5,6,7,8],ymm3[9],ymm8[10,11],ymm3[12],ymm8[13,14,15]
7062 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm8
7063 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
7064 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
7065 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm18[0,0,1,3]
7066 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
7067 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm17[2,2,2,3]
7068 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
7069 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
7070 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm24[1,1,1,1,5,5,5,5]
7071 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0,1],ymm4[2],ymm15[3,4],ymm4[5],ymm15[6,7,8,9],ymm4[10],ymm15[11,12],ymm4[13],ymm15[14,15]
7072 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm4
7073 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm6))
7074 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [6,0,0,0,7,0,0,7]
7075 ; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm3, %ymm3
7076 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5
7077 ; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm22, %zmm5
7078 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm31))
7079 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm4))
7080 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm4 # 32-byte Folded Reload
7081 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm6 # 32-byte Folded Reload
7082 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm4 ^ (zmm2 & (zmm6 ^ zmm4))
7083 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm25 ^ (zmm16 & (zmm21 ^ zmm25))
7084 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
7085 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & mem) | zmm30
7086 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm6))
7087 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
7088 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm2, %zmm2
7089 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
7090 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm4, %zmm4
7091 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm2))
7092 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm9 & mem)
7093 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4))
7094 ; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm2
7095 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
7096 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm23))
7097 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm21))
7098 ; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm3
7099 ; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm4
7100 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
7101 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm28))
7102 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm27))
7103 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7104 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 128(%rax)
7105 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
7106 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax)
7107 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 256(%rax)
7108 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax)
7109 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 384(%rax)
7110 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
7111 ; AVX512DQ-FCP-NEXT: addq $264, %rsp # imm = 0x108
7112 ; AVX512DQ-FCP-NEXT: vzeroupper
7113 ; AVX512DQ-FCP-NEXT: retq
7114 ;
7115 ; AVX512BW-LABEL: store_i16_stride7_vf32:
7116 ; AVX512BW: # %bb.0:
7117 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
7118 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
7119 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3
7120 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm4
7121 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm6
7122 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm7
7123 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1
7124 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm2
7125 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm0
7126 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0]
7127 ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
7128 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm5
7129 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0]
7130 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
7131 ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm8
7132 ; AVX512BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830
7133 ; AVX512BW-NEXT: kmovd %ecx, %k1
7134 ; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm8 {%k1}
7135 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29]
7136 ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
7137 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5
7138 ; AVX512BW-NEXT: movl $1623294726, %ecx # imm = 0x60C18306
7139 ; AVX512BW-NEXT: kmovd %ecx, %k2
7140 ; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm8 {%k2}
7141 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63]
7142 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm5
7143 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36]
7144 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
7145 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm9
7146 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34]
7147 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
7148 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8
7149 ; AVX512BW-NEXT: movl $-1048377844, %ecx # imm = 0xC183060C
7150 ; AVX512BW-NEXT: kmovd %ecx, %k2
7151 ; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm8 {%k2}
7152 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0]
7153 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
7154 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9
7155 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0]
7156 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm9, %zmm10
7157 ; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870
7158 ; AVX512BW-NEXT: kmovd %ecx, %k2
7159 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm8 {%k2}
7160 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0]
7161 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7162 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
7163 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9]
7164 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
7165 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm9
7166 ; AVX512BW-NEXT: movl $202911840, %ecx # imm = 0xC183060
7167 ; AVX512BW-NEXT: kmovd %ecx, %k2
7168 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm9 {%k2}
7169 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38]
7170 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7171 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm10
7172 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0]
7173 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm10, %zmm11
7174 ; AVX512BW-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387
7175 ; AVX512BW-NEXT: kmovd %ecx, %k3
7176 ; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm9 {%k3}
7177 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0]
7178 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7179 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm11
7180 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11]
7181 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7182 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
7183 ; AVX512BW-NEXT: movl $405823681, %ecx # imm = 0x183060C1
7184 ; AVX512BW-NEXT: kmovd %ecx, %k3
7185 ; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm10 {%k3}
7186 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13]
7187 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7188 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm11
7189 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31]
7190 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm12
7191 ; AVX512BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38
7192 ; AVX512BW-NEXT: kmovd %ecx, %k3
7193 ; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm10 {%k3}
7194 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0]
7195 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7196 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm11
7197 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50]
7198 ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
7199 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm12
7200 ; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm12 {%k1}
7201 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0]
7202 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7203 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm11
7204 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0]
7205 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm13
7206 ; AVX512BW-NEXT: movl $946921923, %ecx # imm = 0x3870E1C3
7207 ; AVX512BW-NEXT: kmovd %ecx, %k1
7208 ; AVX512BW-NEXT: vmovdqu16 %zmm13, %zmm12 {%k1}
7209 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0]
7210 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7211 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm11
7212 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52]
7213 ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
7214 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm13
7215 ; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm13 {%k2}
7216 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54]
7217 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7218 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm11
7219 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31]
7220 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm14
7221 ; AVX512BW-NEXT: movl $-1014559204, %ecx # imm = 0xC3870E1C
7222 ; AVX512BW-NEXT: kmovd %ecx, %k1
7223 ; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1}
7224 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27]
7225 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7226 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm11
7227 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25]
7228 ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
7229 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm6
7230 ; AVX512BW-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18
7231 ; AVX512BW-NEXT: kmovd %ecx, %k1
7232 ; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm6 {%k1}
7233 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0]
7234 ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
7235 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3
7236 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0]
7237 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm3, %zmm1
7238 ; AVX512BW-NEXT: movl $473460961, %ecx # imm = 0x1C3870E1
7239 ; AVX512BW-NEXT: kmovd %ecx, %k1
7240 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm6 {%k1}
7241 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%rax)
7242 ; AVX512BW-NEXT: vmovdqa64 %zmm13, 256(%rax)
7243 ; AVX512BW-NEXT: vmovdqa64 %zmm12, 192(%rax)
7244 ; AVX512BW-NEXT: vmovdqa64 %zmm10, 128(%rax)
7245 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rax)
7246 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rax)
7247 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 384(%rax)
7248 ; AVX512BW-NEXT: vzeroupper
7249 ; AVX512BW-NEXT: retq
7250 ;
7251 ; AVX512BW-FCP-LABEL: store_i16_stride7_vf32:
7252 ; AVX512BW-FCP: # %bb.0:
7253 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7254 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
7255 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3
7256 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm4
7257 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm6
7258 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm7
7259 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1
7260 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm2
7261 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r10), %zmm0
7262 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0]
7263 ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
7264 ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm5
7265 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0]
7266 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
7267 ; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm8
7268 ; AVX512BW-FCP-NEXT: movl $101455920, %ecx # imm = 0x60C1830
7269 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
7270 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm8 {%k1}
7271 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29]
7272 ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
7273 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm5
7274 ; AVX512BW-FCP-NEXT: movl $1623294726, %ecx # imm = 0x60C18306
7275 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2
7276 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm8 {%k2}
7277 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63]
7278 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm5
7279 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36]
7280 ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
7281 ; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm9
7282 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34]
7283 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
7284 ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8
7285 ; AVX512BW-FCP-NEXT: movl $-1048377844, %ecx # imm = 0xC183060C
7286 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2
7287 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm8 {%k2}
7288 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0]
7289 ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
7290 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm9
7291 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0]
7292 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm9, %zmm10
7293 ; AVX512BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870
7294 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2
7295 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm8 {%k2}
7296 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0]
7297 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7298 ; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
7299 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9]
7300 ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
7301 ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm9
7302 ; AVX512BW-FCP-NEXT: movl $202911840, %ecx # imm = 0xC183060
7303 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2
7304 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm9 {%k2}
7305 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38]
7306 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7307 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm10
7308 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0]
7309 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm10, %zmm11
7310 ; AVX512BW-FCP-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387
7311 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3
7312 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm9 {%k3}
7313 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0]
7314 ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7315 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm11
7316 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11]
7317 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7318 ; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
7319 ; AVX512BW-FCP-NEXT: movl $405823681, %ecx # imm = 0x183060C1
7320 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3
7321 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm10 {%k3}
7322 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13]
7323 ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7324 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm11
7325 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31]
7326 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm11, %zmm12
7327 ; AVX512BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38
7328 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3
7329 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm10 {%k3}
7330 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0]
7331 ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7332 ; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm11
7333 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50]
7334 ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
7335 ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm12
7336 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm12 {%k1}
7337 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0]
7338 ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7339 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm11
7340 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0]
7341 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm11, %zmm13
7342 ; AVX512BW-FCP-NEXT: movl $946921923, %ecx # imm = 0x3870E1C3
7343 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
7344 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm13, %zmm12 {%k1}
7345 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0]
7346 ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7347 ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm11
7348 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52]
7349 ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
7350 ; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm13
7351 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm13 {%k2}
7352 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54]
7353 ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7354 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm11
7355 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31]
7356 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm11, %zmm14
7357 ; AVX512BW-FCP-NEXT: movl $-1014559204, %ecx # imm = 0xC3870E1C
7358 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
7359 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1}
7360 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27]
7361 ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7362 ; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm11
7363 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25]
7364 ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
7365 ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm6
7366 ; AVX512BW-FCP-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18
7367 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
7368 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm6 {%k1}
7369 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0]
7370 ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
7371 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm3
7372 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0]
7373 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm3, %zmm1
7374 ; AVX512BW-FCP-NEXT: movl $473460961, %ecx # imm = 0x1C3870E1
7375 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
7376 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm6 {%k1}
7377 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 320(%rax)
7378 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax)
7379 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax)
7380 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax)
7381 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rax)
7382 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax)
7383 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 384(%rax)
7384 ; AVX512BW-FCP-NEXT: vzeroupper
7385 ; AVX512BW-FCP-NEXT: retq
7386 ;
7387 ; AVX512DQ-BW-LABEL: store_i16_stride7_vf32:
7388 ; AVX512DQ-BW: # %bb.0:
7389 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
7390 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
7391 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm3
7392 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm4
7393 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm6
7394 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm7
7395 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm1
7396 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm2
7397 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r10), %zmm0
7398 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0]
7399 ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
7400 ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm5
7401 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0]
7402 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
7403 ; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm8
7404 ; AVX512DQ-BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830
7405 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
7406 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm8 {%k1}
7407 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29]
7408 ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
7409 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5
7410 ; AVX512DQ-BW-NEXT: movl $1623294726, %ecx # imm = 0x60C18306
7411 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2
7412 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm8 {%k2}
7413 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63]
7414 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm5
7415 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36]
7416 ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
7417 ; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm9
7418 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34]
7419 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
7420 ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8
7421 ; AVX512DQ-BW-NEXT: movl $-1048377844, %ecx # imm = 0xC183060C
7422 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2
7423 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm8 {%k2}
7424 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0]
7425 ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
7426 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9
7427 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0]
7428 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm9, %zmm10
7429 ; AVX512DQ-BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870
7430 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2
7431 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm8 {%k2}
7432 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0]
7433 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7434 ; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
7435 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9]
7436 ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
7437 ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm9
7438 ; AVX512DQ-BW-NEXT: movl $202911840, %ecx # imm = 0xC183060
7439 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2
7440 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm9 {%k2}
7441 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38]
7442 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7443 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm10
7444 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0]
7445 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm10, %zmm11
7446 ; AVX512DQ-BW-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387
7447 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3
7448 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm11, %zmm9 {%k3}
7449 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0]
7450 ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7451 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm11
7452 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11]
7453 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7454 ; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
7455 ; AVX512DQ-BW-NEXT: movl $405823681, %ecx # imm = 0x183060C1
7456 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3
7457 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm11, %zmm10 {%k3}
7458 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13]
7459 ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7460 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm11
7461 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31]
7462 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm12
7463 ; AVX512DQ-BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38
7464 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3
7465 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm12, %zmm10 {%k3}
7466 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0]
7467 ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7468 ; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm11
7469 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50]
7470 ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
7471 ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm12
7472 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm11, %zmm12 {%k1}
7473 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0]
7474 ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7475 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm11
7476 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0]
7477 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm13
7478 ; AVX512DQ-BW-NEXT: movl $946921923, %ecx # imm = 0x3870E1C3
7479 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
7480 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm13, %zmm12 {%k1}
7481 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0]
7482 ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7483 ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm11
7484 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52]
7485 ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
7486 ; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm13
7487 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm11, %zmm13 {%k2}
7488 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54]
7489 ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7490 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm11
7491 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31]
7492 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm14
7493 ; AVX512DQ-BW-NEXT: movl $-1014559204, %ecx # imm = 0xC3870E1C
7494 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
7495 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1}
7496 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27]
7497 ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7498 ; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm11
7499 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25]
7500 ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
7501 ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm6
7502 ; AVX512DQ-BW-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18
7503 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
7504 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm11, %zmm6 {%k1}
7505 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0]
7506 ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
7507 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3
7508 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0]
7509 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm3, %zmm1
7510 ; AVX512DQ-BW-NEXT: movl $473460961, %ecx # imm = 0x1C3870E1
7511 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
7512 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm6 {%k1}
7513 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 320(%rax)
7514 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 256(%rax)
7515 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 192(%rax)
7516 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 128(%rax)
7517 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 64(%rax)
7518 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rax)
7519 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 384(%rax)
7520 ; AVX512DQ-BW-NEXT: vzeroupper
7521 ; AVX512DQ-BW-NEXT: retq
7523 ; AVX512DQ-BW-FCP-LABEL: store_i16_stride7_vf32:
7524 ; AVX512DQ-BW-FCP: # %bb.0:
7525 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7526 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
7527 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3
7528 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm4
7529 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm6
7530 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm7
7531 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm1
7532 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm2
7533 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r10), %zmm0
7534 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0]
7535 ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
7536 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm5
7537 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0]
7538 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
7539 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm8
7540 ; AVX512DQ-BW-FCP-NEXT: movl $101455920, %ecx # imm = 0x60C1830
7541 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
7542 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm8 {%k1}
7543 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29]
7544 ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
7545 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm5
7546 ; AVX512DQ-BW-FCP-NEXT: movl $1623294726, %ecx # imm = 0x60C18306
7547 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2
7548 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm8 {%k2}
7549 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63]
7550 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm5
7551 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36]
7552 ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
7553 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm9
7554 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34]
7555 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
7556 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8
7557 ; AVX512DQ-BW-FCP-NEXT: movl $-1048377844, %ecx # imm = 0xC183060C
7558 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2
7559 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm8 {%k2}
7560 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0]
7561 ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
7562 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm9
7563 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0]
7564 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm9, %zmm10
7565 ; AVX512DQ-BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870
7566 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2
7567 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm8 {%k2}
7568 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0]
7569 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7570 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
7571 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9]
7572 ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
7573 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm9
7574 ; AVX512DQ-BW-FCP-NEXT: movl $202911840, %ecx # imm = 0xC183060
7575 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2
7576 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm9 {%k2}
7577 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38]
7578 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7579 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm10
7580 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm11 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0]
7581 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm10, %zmm11
7582 ; AVX512DQ-BW-FCP-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387
7583 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3
7584 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm9 {%k3}
7585 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0]
7586 ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7587 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm11
7588 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11]
7589 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
7590 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
7591 ; AVX512DQ-BW-FCP-NEXT: movl $405823681, %ecx # imm = 0x183060C1
7592 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3
7593 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm10 {%k3}
7594 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13]
7595 ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7596 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm11
7597 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31]
7598 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm11, %zmm12
7599 ; AVX512DQ-BW-FCP-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38
7600 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3
7601 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm10 {%k3}
7602 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0]
7603 ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7604 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm11
7605 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50]
7606 ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
7607 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm12
7608 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm12 {%k1}
7609 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0]
7610 ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7611 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm11
7612 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0]
7613 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm11, %zmm13
7614 ; AVX512DQ-BW-FCP-NEXT: movl $946921923, %ecx # imm = 0x3870E1C3
7615 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
7616 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm13, %zmm12 {%k1}
7617 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0]
7618 ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7619 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm11
7620 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52]
7621 ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
7622 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm13
7623 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm13 {%k2}
7624 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54]
7625 ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7626 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm11
7627 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31]
7628 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm11, %zmm14
7629 ; AVX512DQ-BW-FCP-NEXT: movl $-1014559204, %ecx # imm = 0xC3870E1C
7630 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
7631 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1}
7632 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27]
7633 ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
7634 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm11
7635 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25]
7636 ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
7637 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm6
7638 ; AVX512DQ-BW-FCP-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18
7639 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
7640 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm6 {%k1}
7641 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0]
7642 ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
7643 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm3
7644 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0]
7645 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm3, %zmm1
7646 ; AVX512DQ-BW-FCP-NEXT: movl $473460961, %ecx # imm = 0x1C3870E1
7647 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
7648 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm6 {%k1}
7649 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 320(%rax)
7650 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 256(%rax)
7651 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax)
7652 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 128(%rax)
7653 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rax)
7654 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rax)
7655 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 384(%rax)
7656 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
7657 ; AVX512DQ-BW-FCP-NEXT: retq
7658 %in.vec0 = load <32 x i16>, ptr %in.vecptr0, align 64
7659 %in.vec1 = load <32 x i16>, ptr %in.vecptr1, align 64
7660 %in.vec2 = load <32 x i16>, ptr %in.vecptr2, align 64
7661 %in.vec3 = load <32 x i16>, ptr %in.vecptr3, align 64
7662 %in.vec4 = load <32 x i16>, ptr %in.vecptr4, align 64
7663 %in.vec5 = load <32 x i16>, ptr %in.vecptr5, align 64
7664 %in.vec6 = load <32 x i16>, ptr %in.vecptr6, align 64
7665 %1 = shufflevector <32 x i16> %in.vec0, <32 x i16> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
7666 %2 = shufflevector <32 x i16> %in.vec2, <32 x i16> %in.vec3, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
7667 %3 = shufflevector <32 x i16> %in.vec4, <32 x i16> %in.vec5, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
7668 %4 = shufflevector <64 x i16> %1, <64 x i16> %2, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
7669 %5 = shufflevector <32 x i16> %in.vec6, <32 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
7670 %6 = shufflevector <64 x i16> %3, <64 x i16> %5, <96 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
7671 %7 = shufflevector <96 x i16> %6, <96 x i16> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
7672 %8 = shufflevector <128 x i16> %4, <128 x i16> %7, <224 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223>
7673 %interleaved.vec = shufflevector <224 x i16> %8, <224 x i16> poison, <224 x i32> <i32 0, i32 32, i32 64, i32 96, i32 128, i32 160, i32 192, i32 1, i32 33, i32 65, i32 97, i32 129, i32 161, i32 193, i32 2, i32 34, i32 66, i32 98, i32 130, i32 162, i32 194, i32 3, i32 35, i32 67, i32 99, i32 131, i32 163, i32 195, i32 4, i32 36, i32 68, i32 100, i32 132, i32 164, i32 196, i32 5, i32 37, i32 69, i32 101, i32 133, i32 165, i32 197, i32 6, i32 38, i32 70, i32 102, i32 134, i32 166, i32 198, i32 7, i32 39, i32 71, i32 103, i32 135, i32 167, i32 199, i32 8, i32 40, i32 72, i32 104, i32 136, i32 168, i32 200, i32 9, i32 41, i32 73, i32 105, i32 137, i32 169, i32 201, i32 10, i32 42, i32 74, i32 106, i32 138, i32 170, i32 202, i32 11, i32 43, i32 75, i32 107, i32 139, i32 171, i32 203, i32 12, i32 44, i32 76, i32 108, i32 140, i32 172, i32 204, i32 13, i32 45, i32 77, i32 109, i32 141, i32 173, i32 205, i32 14, i32 46, i32 78, i32 110, i32 142, i32 174, i32 206, i32 15, i32 47, i32 79, i32 111, i32 143, i32 175, i32 207, i32 16, i32 48, i32 80, i32 112, i32 144, i32 176, i32 208, i32 17, i32 49, i32 81, i32 113, i32 145, i32 177, i32 209, i32 18, i32 50, i32 82, i32 114, i32 146, i32 178, i32 210, i32 19, i32 51, i32 83, i32 115, i32 147, i32 179, i32 211, i32 20, i32 52, i32 84, i32 116, i32 148, i32 180, i32 212, i32 21, i32 53, i32 85, i32 117, i32 149, i32 181, i32 213, i32 22, i32 54, i32 86, i32 118, i32 150, i32 182, i32 214, i32 23, i32 55, i32 87, i32 119, i32 151, i32 183, i32 215, i32 24, i32 56, i32 88, i32 120, i32 152, i32 184, i32 216, i32 25, i32 57, i32 89, i32 121, i32 153, i32 185, i32 217, i32 26, i32 58, i32 90, i32 122, i32 154, i32 186, i32 218, i32 27, i32 59, i32 91, i32 123, i32 155, i32 187, i32 219, i32 28, i32 60, i32 92, i32 124, i32 156, i32 188, i32 220, i32 29, i32 61, i32 93, i32 125, i32 157, i32 189, i32 221, i32 30, i32 62, i32 94, i32 126, i32 158, i32 190, i32 222, i32 31, i32 63, i32 95, i32 127, i32 159, i32 191, i32 223>
7674 store <224 x i16> %interleaved.vec, ptr %out.vec, align 64
7675 ret void
7676 }
7678 define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %in.vecptr5, ptr %in.vecptr6, ptr %out.vec) nounwind {
7679 ; SSE-LABEL: store_i16_stride7_vf64:
7680 ; SSE: # %bb.0:
7681 ; SSE-NEXT: subq $1640, %rsp # imm = 0x668
7682 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
7683 ; SSE-NEXT: movdqa 112(%rdi), %xmm15
7684 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7685 ; SSE-NEXT: movdqa 112(%rsi), %xmm2
7686 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7687 ; SSE-NEXT: movdqa 96(%rdx), %xmm5
7688 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7689 ; SSE-NEXT: movdqa 112(%rdx), %xmm10
7690 ; SSE-NEXT: movdqa 96(%rcx), %xmm12
7691 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7692 ; SSE-NEXT: movdqa 112(%rcx), %xmm11
7693 ; SSE-NEXT: movdqa 112(%r8), %xmm9
7694 ; SSE-NEXT: movdqa 112(%r9), %xmm8
7695 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7696 ; SSE-NEXT: movaps 112(%rax), %xmm7
7697 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2]
7698 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7699 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,0,65535]
7700 ; SSE-NEXT: movdqa %xmm13, %xmm1
7701 ; SSE-NEXT: pandn %xmm0, %xmm1
7702 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7]
7703 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7704 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
7705 ; SSE-NEXT: pand %xmm13, %xmm0
7706 ; SSE-NEXT: por %xmm1, %xmm0
7707 ; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7]
7708 ; SSE-NEXT: movdqa %xmm15, %xmm1
7709 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7710 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
7711 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535]
7712 ; SSE-NEXT: pand %xmm3, %xmm0
7713 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,1,1]
7714 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7715 ; SSE-NEXT: pandn %xmm2, %xmm3
7716 ; SSE-NEXT: por %xmm0, %xmm3
7717 ; SSE-NEXT: movdqa %xmm8, %xmm0
7718 ; SSE-NEXT: psrld $16, %xmm0
7719 ; SSE-NEXT: movdqa %xmm0, %xmm4
7720 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[0,0]
7721 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[0,2]
7722 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
7723 ; SSE-NEXT: andps %xmm1, %xmm4
7724 ; SSE-NEXT: andnps %xmm7, %xmm1
7725 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7726 ; SSE-NEXT: orps %xmm4, %xmm1
7727 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7728 ; SSE-NEXT: movdqa %xmm12, %xmm1
7729 ; SSE-NEXT: psrlq $48, %xmm1
7730 ; SSE-NEXT: movdqa %xmm5, %xmm3
7731 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1]
7732 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,65535,65535,65535]
7733 ; SSE-NEXT: movdqa %xmm2, %xmm1
7734 ; SSE-NEXT: pandn %xmm3, %xmm1
7735 ; SSE-NEXT: movdqa 96(%rdi), %xmm4
7736 ; SSE-NEXT: movdqa %xmm4, (%rsp) # 16-byte Spill
7737 ; SSE-NEXT: movdqa 96(%rsi), %xmm3
7738 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7739 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
7740 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7741 ; SSE-NEXT: movdqa %xmm4, %xmm3
7742 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7743 ; SSE-NEXT: por %xmm1, %xmm3
7744 ; SSE-NEXT: movdqa 96(%r8), %xmm1
7745 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7746 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
7747 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535]
7748 ; SSE-NEXT: movdqa %xmm5, %xmm4
7749 ; SSE-NEXT: movdqa %xmm5, %xmm14
7750 ; SSE-NEXT: pandn %xmm1, %xmm4
7751 ; SSE-NEXT: por %xmm3, %xmm4
7752 ; SSE-NEXT: movdqa 96(%r9), %xmm1
7753 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7754 ; SSE-NEXT: psrld $16, %xmm1
7755 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm4[2,0]
7756 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,0]
7757 ; SSE-NEXT: movdqa 96(%rax), %xmm1
7758 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7759 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,65535,65535,0]
7760 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
7761 ; SSE-NEXT: movdqa %xmm5, %xmm4
7762 ; SSE-NEXT: pandn %xmm1, %xmm4
7763 ; SSE-NEXT: andps %xmm5, %xmm3
7764 ; SSE-NEXT: por %xmm3, %xmm4
7765 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7766 ; SSE-NEXT: movdqa %xmm11, %xmm1
7767 ; SSE-NEXT: psrlq $48, %xmm1
7768 ; SSE-NEXT: movdqa %xmm10, %xmm3
7769 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm1[1]
7770 ; SSE-NEXT: movdqa %xmm2, %xmm1
7771 ; SSE-NEXT: pandn %xmm3, %xmm1
7772 ; SSE-NEXT: movdqa %xmm15, %xmm3
7773 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7774 ; SSE-NEXT: por %xmm1, %xmm3
7775 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,3,3]
7776 ; SSE-NEXT: movdqa %xmm14, %xmm4
7777 ; SSE-NEXT: pandn %xmm1, %xmm4
7778 ; SSE-NEXT: por %xmm3, %xmm4
7779 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm4[2,0]
7780 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,0]
7781 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3]
7782 ; SSE-NEXT: movdqa %xmm5, %xmm1
7783 ; SSE-NEXT: pandn %xmm0, %xmm1
7784 ; SSE-NEXT: andps %xmm5, %xmm3
7785 ; SSE-NEXT: por %xmm3, %xmm1
7786 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7787 ; SSE-NEXT: movdqa (%rax), %xmm11
7788 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,1]
7789 ; SSE-NEXT: movdqa %xmm14, %xmm1
7790 ; SSE-NEXT: pandn %xmm0, %xmm1
7791 ; SSE-NEXT: movdqa (%r8), %xmm8
7792 ; SSE-NEXT: movdqa (%r9), %xmm7
7793 ; SSE-NEXT: movdqa %xmm8, %xmm3
7794 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7795 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
7796 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7797 ; SSE-NEXT: movdqa %xmm3, %xmm0
7798 ; SSE-NEXT: movdqa %xmm3, %xmm6
7799 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7800 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
7801 ; SSE-NEXT: pand %xmm14, %xmm0
7802 ; SSE-NEXT: por %xmm1, %xmm0
7803 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535]
7804 ; SSE-NEXT: movdqa %xmm2, %xmm1
7805 ; SSE-NEXT: movdqa %xmm2, %xmm10
7806 ; SSE-NEXT: pandn %xmm0, %xmm1
7807 ; SSE-NEXT: movdqa (%rcx), %xmm5
7808 ; SSE-NEXT: movdqa %xmm5, %xmm0
7809 ; SSE-NEXT: psrld $16, %xmm0
7810 ; SSE-NEXT: movdqa (%rdx), %xmm2
7811 ; SSE-NEXT: movdqa %xmm2, %xmm3
7812 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
7813 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,65535,65535,65535,65535,65535]
7814 ; SSE-NEXT: movdqa %xmm0, %xmm4
7815 ; SSE-NEXT: movdqa %xmm0, %xmm12
7816 ; SSE-NEXT: pandn %xmm3, %xmm4
7817 ; SSE-NEXT: movdqa (%rdi), %xmm9
7818 ; SSE-NEXT: movdqa (%rsi), %xmm0
7819 ; SSE-NEXT: movdqa %xmm0, %xmm3
7820 ; SSE-NEXT: movdqa %xmm0, %xmm15
7821 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7822 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
7823 ; SSE-NEXT: movdqa %xmm9, %xmm0
7824 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7825 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
7826 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4]
7827 ; SSE-NEXT: pand %xmm12, %xmm3
7828 ; SSE-NEXT: movdqa %xmm12, %xmm9
7829 ; SSE-NEXT: por %xmm4, %xmm3
7830 ; SSE-NEXT: pand %xmm10, %xmm3
7831 ; SSE-NEXT: por %xmm1, %xmm3
7832 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7833 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535]
7834 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7835 ; SSE-NEXT: pandn %xmm11, %xmm1
7836 ; SSE-NEXT: movdqa %xmm6, %xmm3
7837 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7838 ; SSE-NEXT: por %xmm1, %xmm3
7839 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,0,65535,65535,65535,65535]
7840 ; SSE-NEXT: movdqa %xmm12, %xmm1
7841 ; SSE-NEXT: pandn %xmm3, %xmm1
7842 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7843 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2]
7844 ; SSE-NEXT: movdqa %xmm13, %xmm4
7845 ; SSE-NEXT: pandn %xmm3, %xmm4
7846 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,4,4]
7847 ; SSE-NEXT: pand %xmm13, %xmm3
7848 ; SSE-NEXT: por %xmm4, %xmm3
7849 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
7850 ; SSE-NEXT: movdqa %xmm0, %xmm4
7851 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7852 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3]
7853 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[3,3,3,3,4,5,6,7]
7854 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7855 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2]
7856 ; SSE-NEXT: andps %xmm12, %xmm3
7857 ; SSE-NEXT: orps %xmm1, %xmm3
7858 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7859 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3]
7860 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,65535,65535,65535,65535]
7861 ; SSE-NEXT: movdqa %xmm13, %xmm3
7862 ; SSE-NEXT: pandn %xmm1, %xmm3
7863 ; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
7864 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7]
7865 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4]
7866 ; SSE-NEXT: pand %xmm13, %xmm1
7867 ; SSE-NEXT: movdqa %xmm13, %xmm15
7868 ; SSE-NEXT: por %xmm3, %xmm1
7869 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
7870 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7871 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,3]
7872 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3]
7873 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7874 ; SSE-NEXT: movdqa 16(%rax), %xmm11
7875 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,1]
7876 ; SSE-NEXT: movdqa %xmm14, %xmm2
7877 ; SSE-NEXT: movdqa %xmm14, %xmm3
7878 ; SSE-NEXT: pandn %xmm1, %xmm3
7879 ; SSE-NEXT: movdqa 16(%r8), %xmm14
7880 ; SSE-NEXT: movdqa 16(%r9), %xmm13
7881 ; SSE-NEXT: movdqa %xmm14, %xmm5
7882 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7883 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
7884 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7885 ; SSE-NEXT: movdqa %xmm5, %xmm1
7886 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7887 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
7888 ; SSE-NEXT: pand %xmm2, %xmm1
7889 ; SSE-NEXT: por %xmm3, %xmm1
7890 ; SSE-NEXT: movdqa %xmm10, %xmm3
7891 ; SSE-NEXT: pandn %xmm1, %xmm3
7892 ; SSE-NEXT: movdqa 16(%rcx), %xmm8
7893 ; SSE-NEXT: movdqa %xmm8, %xmm1
7894 ; SSE-NEXT: psrld $16, %xmm1
7895 ; SSE-NEXT: movdqa 16(%rdx), %xmm7
7896 ; SSE-NEXT: movdqa %xmm7, %xmm4
7897 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
7898 ; SSE-NEXT: movdqa %xmm9, %xmm2
7899 ; SSE-NEXT: movdqa %xmm9, %xmm1
7900 ; SSE-NEXT: pandn %xmm4, %xmm1
7901 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
7902 ; SSE-NEXT: movdqa 16(%rsi), %xmm6
7903 ; SSE-NEXT: movdqa %xmm6, %xmm4
7904 ; SSE-NEXT: movdqa %xmm6, %xmm9
7905 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7906 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
7907 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7908 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7]
7909 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4]
7910 ; SSE-NEXT: pand %xmm2, %xmm4
7911 ; SSE-NEXT: por %xmm1, %xmm4
7912 ; SSE-NEXT: pand %xmm10, %xmm4
7913 ; SSE-NEXT: movdqa %xmm10, %xmm6
7914 ; SSE-NEXT: por %xmm3, %xmm4
7915 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7916 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,65535,65535,65535,65535]
7917 ; SSE-NEXT: movdqa %xmm10, %xmm1
7918 ; SSE-NEXT: pandn %xmm11, %xmm1
7919 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7920 ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7921 ; SSE-NEXT: por %xmm1, %xmm5
7922 ; SSE-NEXT: movdqa %xmm12, %xmm1
7923 ; SSE-NEXT: pandn %xmm5, %xmm1
7924 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7925 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,2,2,2]
7926 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,0,65535]
7927 ; SSE-NEXT: movdqa %xmm5, %xmm4
7928 ; SSE-NEXT: pandn %xmm3, %xmm4
7929 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,4,4,4]
7930 ; SSE-NEXT: pand %xmm5, %xmm3
7931 ; SSE-NEXT: por %xmm4, %xmm3
7932 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
7933 ; SSE-NEXT: movdqa %xmm0, %xmm4
7934 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7935 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3]
7936 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7]
7937 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7938 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2]
7939 ; SSE-NEXT: andps %xmm12, %xmm3
7940 ; SSE-NEXT: orps %xmm1, %xmm3
7941 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7942 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,3,2,3]
7943 ; SSE-NEXT: movdqa %xmm15, %xmm3
7944 ; SSE-NEXT: pandn %xmm1, %xmm3
7945 ; SSE-NEXT: movdqa %xmm13, %xmm1
7946 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
7947 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7]
7948 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4]
7949 ; SSE-NEXT: pand %xmm15, %xmm1
7950 ; SSE-NEXT: por %xmm3, %xmm1
7951 ; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
7952 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7953 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm7[2,3]
7954 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3]
7955 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7956 ; SSE-NEXT: movdqa 32(%rax), %xmm7
7957 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,1]
7958 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,65535]
7959 ; SSE-NEXT: movdqa %xmm2, %xmm3
7960 ; SSE-NEXT: pandn %xmm1, %xmm3
7961 ; SSE-NEXT: movdqa 32(%r8), %xmm14
7962 ; SSE-NEXT: movdqa 32(%r9), %xmm13
7963 ; SSE-NEXT: movdqa %xmm14, %xmm0
7964 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7965 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
7966 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7967 ; SSE-NEXT: movdqa %xmm0, %xmm1
7968 ; SSE-NEXT: movdqa %xmm0, %xmm11
7969 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7970 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
7971 ; SSE-NEXT: pand %xmm2, %xmm1
7972 ; SSE-NEXT: por %xmm3, %xmm1
7973 ; SSE-NEXT: movdqa %xmm6, %xmm8
7974 ; SSE-NEXT: movdqa %xmm6, %xmm3
7975 ; SSE-NEXT: pandn %xmm1, %xmm3
7976 ; SSE-NEXT: movdqa 32(%rcx), %xmm2
7977 ; SSE-NEXT: movdqa %xmm2, %xmm1
7978 ; SSE-NEXT: psrld $16, %xmm1
7979 ; SSE-NEXT: movdqa 32(%rdx), %xmm5
7980 ; SSE-NEXT: movdqa %xmm5, %xmm4
7981 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
7982 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,65535,65535,65535,65535,65535]
7983 ; SSE-NEXT: movdqa %xmm12, %xmm1
7984 ; SSE-NEXT: pandn %xmm4, %xmm1
7985 ; SSE-NEXT: movdqa 32(%rdi), %xmm9
7986 ; SSE-NEXT: movdqa 32(%rsi), %xmm0
7987 ; SSE-NEXT: movdqa %xmm0, %xmm4
7988 ; SSE-NEXT: movdqa %xmm0, %xmm6
7989 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7990 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
7991 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7992 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7]
7993 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4]
7994 ; SSE-NEXT: pand %xmm12, %xmm0
7995 ; SSE-NEXT: por %xmm1, %xmm0
7996 ; SSE-NEXT: pand %xmm8, %xmm0
7997 ; SSE-NEXT: por %xmm3, %xmm0
7998 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7999 ; SSE-NEXT: movdqa %xmm10, %xmm1
8000 ; SSE-NEXT: movdqa %xmm10, %xmm15
8001 ; SSE-NEXT: pandn %xmm7, %xmm1
8002 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8003 ; SSE-NEXT: movdqa %xmm11, %xmm3
8004 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8005 ; SSE-NEXT: por %xmm1, %xmm3
8006 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,0,0,65535,65535,65535,65535]
8007 ; SSE-NEXT: movdqa %xmm10, %xmm1
8008 ; SSE-NEXT: pandn %xmm3, %xmm1
8009 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,2,2,2]
8010 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8011 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535]
8012 ; SSE-NEXT: movdqa %xmm0, %xmm4
8013 ; SSE-NEXT: pandn %xmm3, %xmm4
8014 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,4,4]
8015 ; SSE-NEXT: pand %xmm0, %xmm3
8016 ; SSE-NEXT: por %xmm4, %xmm3
8017 ; SSE-NEXT: movdqa %xmm9, %xmm0
8018 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
8019 ; SSE-NEXT: movdqa %xmm0, %xmm4
8020 ; SSE-NEXT: movdqa %xmm0, %xmm11
8021 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8022 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3]
8023 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[3,3,3,3,4,5,6,7]
8024 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8025 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2]
8026 ; SSE-NEXT: andps %xmm10, %xmm0
8027 ; SSE-NEXT: orps %xmm1, %xmm0
8028 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8029 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3]
8030 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535]
8031 ; SSE-NEXT: movdqa %xmm0, %xmm3
8032 ; SSE-NEXT: pandn %xmm1, %xmm3
8033 ; SSE-NEXT: movdqa %xmm13, %xmm1
8034 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
8035 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7]
8036 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4]
8037 ; SSE-NEXT: pand %xmm0, %xmm1
8038 ; SSE-NEXT: por %xmm3, %xmm1
8039 ; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
8040 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8041 ; SSE-NEXT: movdqa %xmm11, %xmm0
8042 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm5[2,3]
8043 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3]
8044 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8045 ; SSE-NEXT: movdqa 48(%rax), %xmm5
8046 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1]
8047 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,65535,65535]
8048 ; SSE-NEXT: movdqa %xmm14, %xmm3
8049 ; SSE-NEXT: pandn %xmm1, %xmm3
8050 ; SSE-NEXT: movdqa 48(%r8), %xmm0
8051 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8052 ; SSE-NEXT: movdqa 48(%r9), %xmm12
8053 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3]
8054 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8055 ; SSE-NEXT: movdqa %xmm0, %xmm1
8056 ; SSE-NEXT: movdqa %xmm0, %xmm11
8057 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8058 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
8059 ; SSE-NEXT: pand %xmm14, %xmm1
8060 ; SSE-NEXT: por %xmm3, %xmm1
8061 ; SSE-NEXT: movdqa %xmm8, %xmm7
8062 ; SSE-NEXT: movdqa %xmm8, %xmm3
8063 ; SSE-NEXT: pandn %xmm1, %xmm3
8064 ; SSE-NEXT: movdqa 48(%rcx), %xmm0
8065 ; SSE-NEXT: movdqa %xmm0, %xmm1
8066 ; SSE-NEXT: movdqa %xmm0, %xmm9
8067 ; SSE-NEXT: psrld $16, %xmm1
8068 ; SSE-NEXT: movdqa 48(%rdx), %xmm0
8069 ; SSE-NEXT: movdqa %xmm0, %xmm4
8070 ; SSE-NEXT: movdqa %xmm0, %xmm10
8071 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
8072 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,65535,65535,65535,65535,65535]
8073 ; SSE-NEXT: movdqa %xmm2, %xmm1
8074 ; SSE-NEXT: pandn %xmm4, %xmm1
8075 ; SSE-NEXT: movdqa 48(%rdi), %xmm13
8076 ; SSE-NEXT: movdqa 48(%rsi), %xmm0
8077 ; SSE-NEXT: movdqa %xmm0, %xmm4
8078 ; SSE-NEXT: movdqa %xmm0, %xmm8
8079 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8080 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3]
8081 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8082 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7]
8083 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4]
8084 ; SSE-NEXT: pand %xmm2, %xmm0
8085 ; SSE-NEXT: por %xmm1, %xmm0
8086 ; SSE-NEXT: pand %xmm7, %xmm0
8087 ; SSE-NEXT: por %xmm3, %xmm0
8088 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8089 ; SSE-NEXT: pandn %xmm5, %xmm15
8090 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8091 ; SSE-NEXT: movdqa %xmm11, %xmm3
8092 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8093 ; SSE-NEXT: por %xmm15, %xmm3
8094 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,0,0,65535,65535,65535,65535]
8095 ; SSE-NEXT: movdqa %xmm2, %xmm1
8096 ; SSE-NEXT: pandn %xmm3, %xmm1
8097 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8098 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,2,2,2]
8099 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535]
8100 ; SSE-NEXT: movdqa %xmm0, %xmm4
8101 ; SSE-NEXT: pandn %xmm3, %xmm4
8102 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,4,4,4,4]
8103 ; SSE-NEXT: pand %xmm0, %xmm3
8104 ; SSE-NEXT: por %xmm4, %xmm3
8105 ; SSE-NEXT: movdqa %xmm13, %xmm0
8106 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
8107 ; SSE-NEXT: movdqa %xmm0, %xmm4
8108 ; SSE-NEXT: movdqa %xmm0, %xmm6
8109 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8110 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3]
8111 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[3,3,3,3,4,5,6,7]
8112 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8113 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2]
8114 ; SSE-NEXT: andps %xmm2, %xmm0
8115 ; SSE-NEXT: orps %xmm1, %xmm0
8116 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8117 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
8118 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535]
8119 ; SSE-NEXT: movdqa %xmm0, %xmm3
8120 ; SSE-NEXT: pandn %xmm1, %xmm3
8121 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
8122 ; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7]
8123 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[2,2,2,2,4,5,6,7]
8124 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4]
8125 ; SSE-NEXT: pand %xmm0, %xmm1
8126 ; SSE-NEXT: por %xmm3, %xmm1
8127 ; SSE-NEXT: movdqa %xmm10, %xmm3
8128 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
8129 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8130 ; SSE-NEXT: movdqa %xmm6, %xmm0
8131 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,3]
8132 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3]
8133 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8134 ; SSE-NEXT: movdqa 64(%rax), %xmm5
8135 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1]
8136 ; SSE-NEXT: movdqa %xmm14, %xmm3
8137 ; SSE-NEXT: pandn %xmm1, %xmm3
8138 ; SSE-NEXT: movdqa 64(%r8), %xmm1
8139 ; SSE-NEXT: movdqa 64(%r9), %xmm10
8140 ; SSE-NEXT: movdqa %xmm1, %xmm0
8141 ; SSE-NEXT: movdqa %xmm1, %xmm7
8142 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8143 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
8144 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8145 ; SSE-NEXT: movdqa %xmm0, %xmm1
8146 ; SSE-NEXT: movdqa %xmm0, %xmm2
8147 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8148 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
8149 ; SSE-NEXT: pand %xmm14, %xmm1
8150 ; SSE-NEXT: por %xmm3, %xmm1
8151 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,0,0,0,65535,65535]
8152 ; SSE-NEXT: movdqa %xmm14, %xmm3
8153 ; SSE-NEXT: pandn %xmm1, %xmm3
8154 ; SSE-NEXT: movdqa 64(%rcx), %xmm0
8155 ; SSE-NEXT: movdqa %xmm0, %xmm1
8156 ; SSE-NEXT: movdqa %xmm0, %xmm11
8157 ; SSE-NEXT: psrld $16, %xmm1
8158 ; SSE-NEXT: movdqa 64(%rdx), %xmm0
8159 ; SSE-NEXT: movdqa %xmm0, %xmm4
8160 ; SSE-NEXT: movdqa %xmm0, %xmm13
8161 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
8162 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,0,65535,65535,65535,65535,65535]
8163 ; SSE-NEXT: movdqa %xmm12, %xmm1
8164 ; SSE-NEXT: pandn %xmm4, %xmm1
8165 ; SSE-NEXT: movdqa 64(%rdi), %xmm8
8166 ; SSE-NEXT: movdqa 64(%rsi), %xmm0
8167 ; SSE-NEXT: movdqa %xmm0, %xmm4
8168 ; SSE-NEXT: movdqa %xmm0, %xmm6
8169 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8170 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
8171 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8172 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7]
8173 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4]
8174 ; SSE-NEXT: pand %xmm12, %xmm0
8175 ; SSE-NEXT: por %xmm1, %xmm0
8176 ; SSE-NEXT: pand %xmm14, %xmm0
8177 ; SSE-NEXT: por %xmm3, %xmm0
8178 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8179 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,0,65535,65535,65535,65535]
8180 ; SSE-NEXT: movdqa %xmm15, %xmm1
8181 ; SSE-NEXT: pandn %xmm5, %xmm1
8182 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8183 ; SSE-NEXT: movdqa %xmm2, %xmm3
8184 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8185 ; SSE-NEXT: por %xmm1, %xmm3
8186 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,0,0,65535,65535,65535,65535]
8187 ; SSE-NEXT: movdqa %xmm9, %xmm1
8188 ; SSE-NEXT: pandn %xmm3, %xmm1
8189 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[2,2,2,2]
8190 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8191 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535]
8192 ; SSE-NEXT: movdqa %xmm0, %xmm4
8193 ; SSE-NEXT: pandn %xmm3, %xmm4
8194 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,4,4,4]
8195 ; SSE-NEXT: pand %xmm0, %xmm3
8196 ; SSE-NEXT: por %xmm4, %xmm3
8197 ; SSE-NEXT: movdqa %xmm8, %xmm0
8198 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
8199 ; SSE-NEXT: movdqa %xmm0, %xmm4
8200 ; SSE-NEXT: movdqa %xmm0, %xmm2
8201 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8202 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3]
8203 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,3,3,3,4,5,6,7]
8204 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8205 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2]
8206 ; SSE-NEXT: andps %xmm9, %xmm0
8207 ; SSE-NEXT: orps %xmm1, %xmm0
8208 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8209 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
8210 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535]
8211 ; SSE-NEXT: movdqa %xmm0, %xmm3
8212 ; SSE-NEXT: pandn %xmm1, %xmm3
8213 ; SSE-NEXT: movdqa %xmm10, %xmm1
8214 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
8215 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7]
8216 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4]
8217 ; SSE-NEXT: pand %xmm0, %xmm1
8218 ; SSE-NEXT: por %xmm3, %xmm1
8219 ; SSE-NEXT: movdqa %xmm13, %xmm3
8220 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
8221 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8222 ; SSE-NEXT: movdqa %xmm2, %xmm0
8223 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,3]
8224 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3]
8225 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8226 ; SSE-NEXT: movdqa 80(%rax), %xmm9
8227 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,1]
8228 ; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,65535,65535]
8229 ; SSE-NEXT: movdqa %xmm12, %xmm3
8230 ; SSE-NEXT: pandn %xmm1, %xmm3
8231 ; SSE-NEXT: movdqa 80(%r8), %xmm1
8232 ; SSE-NEXT: movdqa 80(%r9), %xmm11
8233 ; SSE-NEXT: movdqa %xmm1, %xmm8
8234 ; SSE-NEXT: movdqa %xmm1, %xmm10
8235 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8236 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3]
8237 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8238 ; SSE-NEXT: movdqa %xmm8, %xmm1
8239 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8240 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
8241 ; SSE-NEXT: pand %xmm12, %xmm1
8242 ; SSE-NEXT: por %xmm3, %xmm1
8243 ; SSE-NEXT: movdqa %xmm14, %xmm3
8244 ; SSE-NEXT: pandn %xmm1, %xmm3
8245 ; SSE-NEXT: movdqa 80(%rcx), %xmm0
8246 ; SSE-NEXT: movdqa %xmm0, %xmm1
8247 ; SSE-NEXT: movdqa %xmm0, %xmm13
8248 ; SSE-NEXT: psrld $16, %xmm1
8249 ; SSE-NEXT: movdqa 80(%rdx), %xmm2
8250 ; SSE-NEXT: movdqa %xmm2, %xmm4
8251 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
8252 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,0,65535,65535,65535,65535,65535]
8253 ; SSE-NEXT: movdqa %xmm5, %xmm1
8254 ; SSE-NEXT: pandn %xmm4, %xmm1
8255 ; SSE-NEXT: movdqa 80(%rdi), %xmm7
8256 ; SSE-NEXT: movdqa 80(%rsi), %xmm0
8257 ; SSE-NEXT: movdqa %xmm0, %xmm4
8258 ; SSE-NEXT: movdqa %xmm0, %xmm6
8259 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8260 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
8261 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8262 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7]
8263 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,5,4]
8264 ; SSE-NEXT: pand %xmm5, %xmm0
8265 ; SSE-NEXT: por %xmm1, %xmm0
8266 ; SSE-NEXT: pand %xmm14, %xmm0
8267 ; SSE-NEXT: por %xmm3, %xmm0
8268 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8269 ; SSE-NEXT: movdqa %xmm15, %xmm1
8270 ; SSE-NEXT: movdqa %xmm9, %xmm5
8271 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8272 ; SSE-NEXT: pandn %xmm9, %xmm1
8273 ; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8274 ; SSE-NEXT: por %xmm1, %xmm8
8275 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,0,0,0,65535,65535,65535,65535]
8276 ; SSE-NEXT: movdqa %xmm9, %xmm1
8277 ; SSE-NEXT: pandn %xmm8, %xmm1
8278 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2]
8279 ; SSE-NEXT: movdqa %xmm2, %xmm8
8280 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8281 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,65535,0,65535]
8282 ; SSE-NEXT: movdqa %xmm0, %xmm4
8283 ; SSE-NEXT: pandn %xmm3, %xmm4
8284 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm13[0,1,2,3,4,4,4,4]
8285 ; SSE-NEXT: pand %xmm0, %xmm3
8286 ; SSE-NEXT: por %xmm4, %xmm3
8287 ; SSE-NEXT: movdqa %xmm7, %xmm0
8288 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
8289 ; SSE-NEXT: movdqa %xmm0, %xmm4
8290 ; SSE-NEXT: movdqa %xmm0, %xmm2
8291 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8292 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[3,3]
8293 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[3,3,3,3,4,5,6,7]
8294 ; SSE-NEXT: movdqa %xmm13, %xmm6
8295 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8296 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,2]
8297 ; SSE-NEXT: andps %xmm9, %xmm0
8298 ; SSE-NEXT: movaps %xmm9, %xmm13
8299 ; SSE-NEXT: orps %xmm1, %xmm0
8300 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8301 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
8302 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,65535,65535,65535,65535]
8303 ; SSE-NEXT: movdqa %xmm0, %xmm3
8304 ; SSE-NEXT: pandn %xmm1, %xmm3
8305 ; SSE-NEXT: movdqa %xmm11, %xmm1
8306 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
8307 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7]
8308 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4]
8309 ; SSE-NEXT: pand %xmm0, %xmm1
8310 ; SSE-NEXT: movdqa %xmm0, %xmm11
8311 ; SSE-NEXT: por %xmm3, %xmm1
8312 ; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
8313 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8314 ; SSE-NEXT: movdqa %xmm2, %xmm0
8315 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm8[2,3]
8316 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[0,3]
8317 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8318 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8319 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1]
8320 ; SSE-NEXT: movdqa %xmm12, %xmm3
8321 ; SSE-NEXT: pandn %xmm1, %xmm3
8322 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8323 ; SSE-NEXT: movdqa %xmm8, %xmm0
8324 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8325 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
8326 ; SSE-NEXT: movdqa %xmm0, %xmm1
8327 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8328 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
8329 ; SSE-NEXT: pand %xmm12, %xmm1
8330 ; SSE-NEXT: por %xmm3, %xmm1
8331 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8332 ; SSE-NEXT: movdqa %xmm2, %xmm3
8333 ; SSE-NEXT: psrld $16, %xmm3
8334 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8335 ; SSE-NEXT: movdqa %xmm7, %xmm4
8336 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
8337 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,0,65535,65535,65535,65535,65535]
8338 ; SSE-NEXT: movdqa %xmm14, %xmm3
8339 ; SSE-NEXT: pandn %xmm4, %xmm3
8340 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8341 ; SSE-NEXT: punpcklwd (%rsp), %xmm4 # 16-byte Folded Reload
8342 ; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3]
8343 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7]
8344 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4]
8345 ; SSE-NEXT: pand %xmm14, %xmm4
8346 ; SSE-NEXT: por %xmm3, %xmm4
8347 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,65535,0,0,0,65535,65535]
8348 ; SSE-NEXT: pand %xmm10, %xmm4
8349 ; SSE-NEXT: pandn %xmm1, %xmm10
8350 ; SSE-NEXT: por %xmm4, %xmm10
8351 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8352 ; SSE-NEXT: movdqa %xmm15, %xmm3
8353 ; SSE-NEXT: movdqa %xmm15, %xmm12
8354 ; SSE-NEXT: pandn %xmm5, %xmm3
8355 ; SSE-NEXT: movdqa %xmm5, %xmm10
8356 ; SSE-NEXT: movdqa %xmm0, %xmm1
8357 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8358 ; SSE-NEXT: por %xmm3, %xmm1
8359 ; SSE-NEXT: movdqa %xmm7, %xmm14
8360 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[2,2,2,2]
8361 ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,65535,0,65535]
8362 ; SSE-NEXT: movdqa %xmm9, %xmm4
8363 ; SSE-NEXT: pandn %xmm3, %xmm4
8364 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,4,4]
8365 ; SSE-NEXT: pand %xmm9, %xmm5
8366 ; SSE-NEXT: por %xmm4, %xmm5
8367 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8368 ; SSE-NEXT: movaps %xmm0, %xmm3
8369 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[3,3]
8370 ; SSE-NEXT: movdqa %xmm7, %xmm4
8371 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
8372 ; SSE-NEXT: movdqa %xmm4, %xmm5
8373 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8374 ; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3]
8375 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8376 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[3,3,3,3,4,5,6,7]
8377 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2]
8378 ; SSE-NEXT: movaps %xmm13, %xmm3
8379 ; SSE-NEXT: andps %xmm13, %xmm4
8380 ; SSE-NEXT: andnps %xmm1, %xmm3
8381 ; SSE-NEXT: orps %xmm4, %xmm3
8382 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8383 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[2,3,2,3]
8384 ; SSE-NEXT: movdqa %xmm11, %xmm1
8385 ; SSE-NEXT: pandn %xmm3, %xmm1
8386 ; SSE-NEXT: movdqa %xmm8, %xmm2
8387 ; SSE-NEXT: movdqa %xmm6, %xmm3
8388 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
8389 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8390 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
8391 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
8392 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4]
8393 ; SSE-NEXT: pand %xmm11, %xmm3
8394 ; SSE-NEXT: movdqa %xmm11, %xmm15
8395 ; SSE-NEXT: por %xmm1, %xmm3
8396 ; SSE-NEXT: movaps %xmm0, %xmm1
8397 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm5[2,3]
8398 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm3[0,3]
8399 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8400 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8401 ; SSE-NEXT: movdqa %xmm9, %xmm1
8402 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8403 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
8404 ; SSE-NEXT: movdqa %xmm9, %xmm3
8405 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
8406 ; SSE-NEXT: psrld $16, %xmm4
8407 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
8408 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
8409 ; SSE-NEXT: movdqa %xmm13, %xmm4
8410 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8411 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3]
8412 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7]
8413 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4]
8414 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8415 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,0,0,0]
8416 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
8417 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,1],xmm4[3,3]
8418 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,0,65535,65535,65535,65535,65535]
8419 ; SSE-NEXT: pand %xmm0, %xmm4
8420 ; SSE-NEXT: pandn %xmm3, %xmm0
8421 ; SSE-NEXT: por %xmm4, %xmm0
8422 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8423 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,0,1,1]
8424 ; SSE-NEXT: movdqa %xmm12, %xmm4
8425 ; SSE-NEXT: pandn %xmm3, %xmm4
8426 ; SSE-NEXT: pand %xmm12, %xmm0
8427 ; SSE-NEXT: por %xmm0, %xmm4
8428 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2]
8429 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8430 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
8431 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,65535,65535]
8432 ; SSE-NEXT: movdqa %xmm5, %xmm3
8433 ; SSE-NEXT: pandn %xmm0, %xmm3
8434 ; SSE-NEXT: andps %xmm5, %xmm4
8435 ; SSE-NEXT: por %xmm4, %xmm3
8436 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8437 ; SSE-NEXT: movdqa %xmm11, %xmm4
8438 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[2,2,2,2]
8439 ; SSE-NEXT: movdqa %xmm11, %xmm0
8440 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8441 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm1[2,0]
8442 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8443 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
8444 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,65535]
8445 ; SSE-NEXT: movdqa %xmm4, %xmm5
8446 ; SSE-NEXT: pandn %xmm1, %xmm5
8447 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8448 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
8449 ; SSE-NEXT: pand %xmm4, %xmm1
8450 ; SSE-NEXT: por %xmm5, %xmm1
8451 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,65535,65535,65535,65535,65535,65535,0]
8452 ; SSE-NEXT: movdqa %xmm7, %xmm5
8453 ; SSE-NEXT: pandn %xmm3, %xmm5
8454 ; SSE-NEXT: pand %xmm7, %xmm1
8455 ; SSE-NEXT: por %xmm1, %xmm5
8456 ; SSE-NEXT: movdqa %xmm6, %xmm1
8457 ; SSE-NEXT: movdqa %xmm6, %xmm7
8458 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
8459 ; SSE-NEXT: movdqa %xmm15, %xmm3
8460 ; SSE-NEXT: pandn %xmm1, %xmm3
8461 ; SSE-NEXT: pand %xmm15, %xmm5
8462 ; SSE-NEXT: por %xmm5, %xmm3
8463 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,65535,65535,65535]
8464 ; SSE-NEXT: pand %xmm5, %xmm3
8465 ; SSE-NEXT: movdqa %xmm2, %xmm6
8466 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
8467 ; SSE-NEXT: pandn %xmm1, %xmm5
8468 ; SSE-NEXT: por %xmm3, %xmm5
8469 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8470 ; SSE-NEXT: movdqa %xmm10, %xmm3
8471 ; SSE-NEXT: movdqa %xmm10, %xmm1
8472 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[1,1,1,1,4,5,6,7]
8473 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8474 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
8475 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8476 ; SSE-NEXT: movdqa %xmm13, %xmm3
8477 ; SSE-NEXT: psrld $16, %xmm3
8478 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
8479 ; SSE-NEXT: movdqa %xmm4, %xmm3
8480 ; SSE-NEXT: pandn %xmm1, %xmm3
8481 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,5,6,6]
8482 ; SSE-NEXT: pand %xmm4, %xmm1
8483 ; SSE-NEXT: por %xmm3, %xmm1
8484 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm9[2,3]
8485 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
8486 ; SSE-NEXT: andps %xmm12, %xmm0
8487 ; SSE-NEXT: movdqa %xmm7, %xmm1
8488 ; SSE-NEXT: pslld $16, %xmm1
8489 ; SSE-NEXT: pandn %xmm1, %xmm12
8490 ; SSE-NEXT: por %xmm0, %xmm12
8491 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535]
8492 ; SSE-NEXT: pand %xmm1, %xmm12
8493 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,1,1,3]
8494 ; SSE-NEXT: pandn %xmm0, %xmm1
8495 ; SSE-NEXT: por %xmm12, %xmm1
8496 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8497 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8498 ; SSE-NEXT: movdqa %xmm1, %xmm0
8499 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8500 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8501 ; SSE-NEXT: movdqa %xmm1, %xmm13
8502 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8503 ; SSE-NEXT: movdqa %xmm2, %xmm1
8504 ; SSE-NEXT: psrlq $48, %xmm1
8505 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
8506 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535]
8507 ; SSE-NEXT: movdqa %xmm8, %xmm1
8508 ; SSE-NEXT: pandn %xmm0, %xmm1
8509 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8510 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8511 ; SSE-NEXT: por %xmm1, %xmm0
8512 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8513 ; SSE-NEXT: movdqa %xmm3, %xmm2
8514 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8515 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
8516 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8517 ; SSE-NEXT: psrld $16, %xmm1
8518 ; SSE-NEXT: movdqa %xmm3, %xmm2
8519 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8520 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8521 ; SSE-NEXT: movdqa %xmm11, %xmm1
8522 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
8523 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
8524 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1]
8525 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4]
8526 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,0,0,0]
8527 ; SSE-NEXT: movdqa %xmm5, %xmm2
8528 ; SSE-NEXT: pandn %xmm1, %xmm2
8529 ; SSE-NEXT: pand %xmm5, %xmm0
8530 ; SSE-NEXT: por %xmm0, %xmm2
8531 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8532 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8533 ; SSE-NEXT: movdqa %xmm6, %xmm0
8534 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8535 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
8536 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8537 ; SSE-NEXT: psrlq $48, %xmm1
8538 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
8539 ; SSE-NEXT: movdqa %xmm8, %xmm15
8540 ; SSE-NEXT: movdqa %xmm8, %xmm1
8541 ; SSE-NEXT: pandn %xmm0, %xmm1
8542 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8543 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8544 ; SSE-NEXT: por %xmm1, %xmm0
8545 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8546 ; SSE-NEXT: movdqa %xmm3, %xmm2
8547 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8548 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
8549 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8550 ; SSE-NEXT: psrld $16, %xmm1
8551 ; SSE-NEXT: movdqa %xmm3, %xmm2
8552 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8553 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8554 ; SSE-NEXT: movdqa %xmm8, %xmm1
8555 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
8556 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
8557 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1]
8558 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4]
8559 ; SSE-NEXT: movdqa %xmm5, %xmm2
8560 ; SSE-NEXT: pandn %xmm1, %xmm2
8561 ; SSE-NEXT: pand %xmm5, %xmm0
8562 ; SSE-NEXT: por %xmm0, %xmm2
8563 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8564 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8565 ; SSE-NEXT: movdqa %xmm7, %xmm0
8566 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8567 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
8568 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8569 ; SSE-NEXT: psrlq $48, %xmm1
8570 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
8571 ; SSE-NEXT: movdqa %xmm15, %xmm1
8572 ; SSE-NEXT: pandn %xmm0, %xmm1
8573 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8574 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8575 ; SSE-NEXT: por %xmm1, %xmm0
8576 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8577 ; SSE-NEXT: movdqa %xmm3, %xmm2
8578 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8579 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
8580 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8581 ; SSE-NEXT: psrld $16, %xmm1
8582 ; SSE-NEXT: movdqa %xmm3, %xmm2
8583 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8584 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
8585 ; SSE-NEXT: movdqa %xmm14, %xmm1
8586 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
8587 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
8588 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1]
8589 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4]
8590 ; SSE-NEXT: movdqa %xmm5, %xmm2
8591 ; SSE-NEXT: pandn %xmm1, %xmm2
8592 ; SSE-NEXT: pand %xmm5, %xmm0
8593 ; SSE-NEXT: por %xmm0, %xmm2
8594 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8595 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8596 ; SSE-NEXT: movdqa %xmm1, %xmm0
8597 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8598 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8599 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8600 ; SSE-NEXT: movdqa %xmm2, %xmm1
8601 ; SSE-NEXT: psrlq $48, %xmm1
8602 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
8603 ; SSE-NEXT: movdqa %xmm15, %xmm1
8604 ; SSE-NEXT: pandn %xmm0, %xmm1
8605 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8606 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8607 ; SSE-NEXT: por %xmm1, %xmm0
8608 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8609 ; SSE-NEXT: movdqa %xmm3, %xmm2
8610 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8611 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
8612 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8613 ; SSE-NEXT: psrld $16, %xmm1
8614 ; SSE-NEXT: movdqa %xmm3, %xmm2
8615 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8616 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8617 ; SSE-NEXT: movdqa %xmm3, %xmm1
8618 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
8619 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
8620 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1]
8621 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4]
8622 ; SSE-NEXT: movdqa %xmm5, %xmm2
8623 ; SSE-NEXT: pandn %xmm1, %xmm2
8624 ; SSE-NEXT: pand %xmm5, %xmm0
8625 ; SSE-NEXT: por %xmm0, %xmm2
8626 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8627 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8628 ; SSE-NEXT: movdqa %xmm1, %xmm0
8629 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8630 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8631 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8632 ; SSE-NEXT: movdqa %xmm2, %xmm1
8633 ; SSE-NEXT: psrlq $48, %xmm1
8634 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
8635 ; SSE-NEXT: movdqa %xmm15, %xmm1
8636 ; SSE-NEXT: pandn %xmm0, %xmm1
8637 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8638 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8639 ; SSE-NEXT: por %xmm1, %xmm0
8640 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8641 ; SSE-NEXT: movdqa %xmm10, %xmm2
8642 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8643 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
8644 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8645 ; SSE-NEXT: psrld $16, %xmm1
8646 ; SSE-NEXT: movdqa %xmm10, %xmm2
8647 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8648 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8649 ; SSE-NEXT: movdqa %xmm10, %xmm1
8650 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
8651 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
8652 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1]
8653 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4]
8654 ; SSE-NEXT: movdqa %xmm5, %xmm2
8655 ; SSE-NEXT: pandn %xmm1, %xmm2
8656 ; SSE-NEXT: pand %xmm5, %xmm0
8657 ; SSE-NEXT: por %xmm0, %xmm2
8658 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8659 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8660 ; SSE-NEXT: movdqa %xmm1, %xmm0
8661 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8662 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8663 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8664 ; SSE-NEXT: movdqa %xmm2, %xmm1
8665 ; SSE-NEXT: psrlq $48, %xmm1
8666 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
8667 ; SSE-NEXT: movdqa %xmm15, %xmm1
8668 ; SSE-NEXT: pandn %xmm0, %xmm1
8669 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8670 ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8671 ; SSE-NEXT: por %xmm1, %xmm0
8672 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8673 ; SSE-NEXT: movdqa %xmm12, %xmm2
8674 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8675 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
8676 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8677 ; SSE-NEXT: psrld $16, %xmm1
8678 ; SSE-NEXT: movdqa %xmm12, %xmm2
8679 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8680 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8681 ; SSE-NEXT: movdqa %xmm15, %xmm1
8682 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
8683 ; SSE-NEXT: pand %xmm5, %xmm0
8684 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
8685 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,1]
8686 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,5,4]
8687 ; SSE-NEXT: pandn %xmm1, %xmm5
8688 ; SSE-NEXT: por %xmm0, %xmm5
8689 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8690 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8691 ; SSE-NEXT: movdqa %xmm1, %xmm0
8692 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7]
8693 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8694 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8695 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8696 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8697 ; SSE-NEXT: movdqa %xmm2, %xmm1
8698 ; SSE-NEXT: psrld $16, %xmm1
8699 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
8700 ; SSE-NEXT: movdqa %xmm4, %xmm1
8701 ; SSE-NEXT: pandn %xmm0, %xmm1
8702 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5,6,6]
8703 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
8704 ; SSE-NEXT: pand %xmm4, %xmm0
8705 ; SSE-NEXT: por %xmm1, %xmm0
8706 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8707 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm11[1,1]
8708 ; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,65535,0,0,0,65535,65535,65535]
8709 ; SSE-NEXT: movaps %xmm5, %xmm2
8710 ; SSE-NEXT: andnps %xmm1, %xmm2
8711 ; SSE-NEXT: pand %xmm5, %xmm0
8712 ; SSE-NEXT: orps %xmm0, %xmm2
8713 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8714 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8715 ; SSE-NEXT: movdqa %xmm1, %xmm0
8716 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7]
8717 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8718 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8719 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8720 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8721 ; SSE-NEXT: movdqa %xmm2, %xmm1
8722 ; SSE-NEXT: psrld $16, %xmm1
8723 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
8724 ; SSE-NEXT: movdqa %xmm4, %xmm1
8725 ; SSE-NEXT: pandn %xmm0, %xmm1
8726 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,6,6]
8727 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
8728 ; SSE-NEXT: pand %xmm4, %xmm0
8729 ; SSE-NEXT: por %xmm1, %xmm0
8730 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8731 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm8[1,1]
8732 ; SSE-NEXT: movaps %xmm8, %xmm12
8733 ; SSE-NEXT: movaps %xmm5, %xmm2
8734 ; SSE-NEXT: andnps %xmm1, %xmm2
8735 ; SSE-NEXT: pand %xmm5, %xmm0
8736 ; SSE-NEXT: orps %xmm0, %xmm2
8737 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8738 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8739 ; SSE-NEXT: movdqa %xmm1, %xmm0
8740 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7]
8741 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8742 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8743 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8744 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8745 ; SSE-NEXT: movdqa %xmm2, %xmm1
8746 ; SSE-NEXT: psrld $16, %xmm1
8747 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
8748 ; SSE-NEXT: movdqa %xmm4, %xmm1
8749 ; SSE-NEXT: pandn %xmm0, %xmm1
8750 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5,6,6]
8751 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
8752 ; SSE-NEXT: pand %xmm4, %xmm0
8753 ; SSE-NEXT: por %xmm1, %xmm0
8754 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8755 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm14[1,1]
8756 ; SSE-NEXT: movaps %xmm14, %xmm11
8757 ; SSE-NEXT: movaps %xmm5, %xmm2
8758 ; SSE-NEXT: andnps %xmm1, %xmm2
8759 ; SSE-NEXT: pand %xmm5, %xmm0
8760 ; SSE-NEXT: orps %xmm0, %xmm2
8761 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8762 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8763 ; SSE-NEXT: movdqa %xmm1, %xmm0
8764 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7]
8765 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8766 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8767 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8768 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8769 ; SSE-NEXT: movdqa %xmm2, %xmm1
8770 ; SSE-NEXT: psrld $16, %xmm1
8771 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
8772 ; SSE-NEXT: movdqa %xmm4, %xmm1
8773 ; SSE-NEXT: pandn %xmm0, %xmm1
8774 ; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8775 ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6]
8776 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
8777 ; SSE-NEXT: pand %xmm4, %xmm0
8778 ; SSE-NEXT: por %xmm1, %xmm0
8779 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8780 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm3[1,1]
8781 ; SSE-NEXT: movaps %xmm3, %xmm14
8782 ; SSE-NEXT: movaps %xmm5, %xmm2
8783 ; SSE-NEXT: andnps %xmm1, %xmm2
8784 ; SSE-NEXT: pand %xmm5, %xmm0
8785 ; SSE-NEXT: orps %xmm0, %xmm2
8786 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8787 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8788 ; SSE-NEXT: movdqa %xmm1, %xmm0
8789 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7]
8790 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8791 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8792 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8793 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8794 ; SSE-NEXT: movdqa %xmm2, %xmm1
8795 ; SSE-NEXT: psrld $16, %xmm1
8796 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
8797 ; SSE-NEXT: movdqa %xmm4, %xmm1
8798 ; SSE-NEXT: pandn %xmm0, %xmm1
8799 ; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8800 ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6]
8801 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
8802 ; SSE-NEXT: pand %xmm4, %xmm0
8803 ; SSE-NEXT: por %xmm1, %xmm0
8804 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8805 ; SSE-NEXT: movdqa %xmm10, %xmm13
8806 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm10[1,1]
8807 ; SSE-NEXT: movaps %xmm5, %xmm2
8808 ; SSE-NEXT: andnps %xmm1, %xmm2
8809 ; SSE-NEXT: pand %xmm5, %xmm0
8810 ; SSE-NEXT: orps %xmm0, %xmm2
8811 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8812 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8813 ; SSE-NEXT: movdqa %xmm1, %xmm0
8814 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7]
8815 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8816 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8817 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8818 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8819 ; SSE-NEXT: movdqa %xmm2, %xmm1
8820 ; SSE-NEXT: psrld $16, %xmm1
8821 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
8822 ; SSE-NEXT: movdqa %xmm4, %xmm1
8823 ; SSE-NEXT: pandn %xmm0, %xmm1
8824 ; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8825 ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6]
8826 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
8827 ; SSE-NEXT: pand %xmm4, %xmm0
8828 ; SSE-NEXT: por %xmm1, %xmm0
8829 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8830 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm15[1,1]
8831 ; SSE-NEXT: movaps %xmm5, %xmm2
8832 ; SSE-NEXT: andnps %xmm1, %xmm2
8833 ; SSE-NEXT: pand %xmm5, %xmm0
8834 ; SSE-NEXT: orps %xmm0, %xmm2
8835 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8836 ; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
8837 ; SSE-NEXT: movdqa %xmm1, %xmm0
8838 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7]
8839 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8840 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8841 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8842 ; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
8843 ; SSE-NEXT: movdqa %xmm2, %xmm1
8844 ; SSE-NEXT: psrld $16, %xmm1
8845 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
8846 ; SSE-NEXT: movdqa %xmm4, %xmm1
8847 ; SSE-NEXT: pandn %xmm0, %xmm1
8848 ; SSE-NEXT: pshufhw $164, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8849 ; SSE-NEXT: # xmm0 = mem[0,1,2,3,4,5,6,6]
8850 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
8851 ; SSE-NEXT: pand %xmm4, %xmm0
8852 ; SSE-NEXT: por %xmm1, %xmm0
8853 ; SSE-NEXT: pand %xmm5, %xmm0
8854 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8855 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8856 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,2],xmm3[1,1]
8857 ; SSE-NEXT: andnps %xmm1, %xmm5
8858 ; SSE-NEXT: orps %xmm0, %xmm5
8859 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8860 ; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
8861 ; SSE-NEXT: # xmm9 = xmm9[0],mem[0]
8862 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8863 ; SSE-NEXT: shufps $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8864 ; SSE-NEXT: # xmm0 = xmm0[0,1],mem[1,3]
8865 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[0,2]
8866 ; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,65535,65535]
8867 ; SSE-NEXT: andps %xmm2, %xmm9
8868 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8869 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,6,6,7]
8870 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
8871 ; SSE-NEXT: andnps %xmm1, %xmm2
8872 ; SSE-NEXT: orps %xmm9, %xmm2
8873 ; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,0,65535]
8874 ; SSE-NEXT: andps %xmm5, %xmm2
8875 ; SSE-NEXT: movaps %xmm2, %xmm7
8876 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8877 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
8878 ; SSE-NEXT: andnps %xmm1, %xmm5
8879 ; SSE-NEXT: orps %xmm7, %xmm5
8880 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8881 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8882 ; SSE-NEXT: shufps $42, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
8883 ; SSE-NEXT: # xmm5 = xmm5[2,2],mem[2,0]
8884 ; SSE-NEXT: movaps {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0]
8885 ; SSE-NEXT: andps %xmm1, %xmm5
8886 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
8887 ; SSE-NEXT: andnps %xmm0, %xmm1
8888 ; SSE-NEXT: orps %xmm5, %xmm1
8889 ; SSE-NEXT: movaps {{.*#+}} xmm5 = [65535,0,65535,65535,65535,65535,65535,65535]
8890 ; SSE-NEXT: andps %xmm5, %xmm1
8891 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
8892 ; SSE-NEXT: andnps %xmm0, %xmm5
8893 ; SSE-NEXT: orps %xmm1, %xmm5
8894 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8895 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8896 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
8897 ; SSE-NEXT: movdqa %xmm4, %xmm0
8898 ; SSE-NEXT: pandn %xmm1, %xmm0
8899 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8900 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
8901 ; SSE-NEXT: pand %xmm4, %xmm1
8902 ; SSE-NEXT: por %xmm0, %xmm1
8903 ; SSE-NEXT: movdqa %xmm1, %xmm0
8904 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,0,65535]
8905 ; SSE-NEXT: movdqa %xmm2, %xmm1
8906 ; SSE-NEXT: pandn %xmm0, %xmm1
8907 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8908 ; SSE-NEXT: movapd %xmm10, %xmm5
8909 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
8910 ; SSE-NEXT: # xmm5 = xmm5[1],mem[0]
8911 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8912 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
8913 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,1]
8914 ; SSE-NEXT: andps %xmm2, %xmm5
8915 ; SSE-NEXT: orps %xmm1, %xmm5
8916 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8917 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8918 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
8919 ; SSE-NEXT: movdqa %xmm4, %xmm1
8920 ; SSE-NEXT: pandn %xmm0, %xmm1
8921 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8922 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
8923 ; SSE-NEXT: pand %xmm4, %xmm0
8924 ; SSE-NEXT: por %xmm1, %xmm0
8925 ; SSE-NEXT: movdqa %xmm2, %xmm1
8926 ; SSE-NEXT: pandn %xmm0, %xmm1
8927 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
8928 ; SSE-NEXT: # xmm12 = xmm12[1],mem[0]
8929 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8930 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
8931 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[2,1]
8932 ; SSE-NEXT: andps %xmm2, %xmm12
8933 ; SSE-NEXT: orps %xmm1, %xmm12
8934 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8935 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
8936 ; SSE-NEXT: movdqa %xmm4, %xmm1
8937 ; SSE-NEXT: pandn %xmm0, %xmm1
8938 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8939 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
8940 ; SSE-NEXT: pand %xmm4, %xmm0
8941 ; SSE-NEXT: por %xmm1, %xmm0
8942 ; SSE-NEXT: movdqa %xmm2, %xmm1
8943 ; SSE-NEXT: pandn %xmm0, %xmm1
8944 ; SSE-NEXT: movaps %xmm11, %xmm6
8945 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
8946 ; SSE-NEXT: # xmm11 = xmm11[1],mem[0]
8947 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8948 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
8949 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm0[2,1]
8950 ; SSE-NEXT: andps %xmm2, %xmm11
8951 ; SSE-NEXT: orps %xmm1, %xmm11
8952 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8953 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
8954 ; SSE-NEXT: movdqa %xmm4, %xmm1
8955 ; SSE-NEXT: pandn %xmm0, %xmm1
8956 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8957 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
8958 ; SSE-NEXT: pand %xmm4, %xmm0
8959 ; SSE-NEXT: por %xmm1, %xmm0
8960 ; SSE-NEXT: movdqa %xmm2, %xmm1
8961 ; SSE-NEXT: pandn %xmm0, %xmm1
8962 ; SSE-NEXT: movaps %xmm14, %xmm15
8963 ; SSE-NEXT: movaps %xmm14, %xmm9
8964 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
8965 ; SSE-NEXT: # xmm9 = xmm9[1],mem[0]
8966 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8967 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
8968 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm0[2,1]
8969 ; SSE-NEXT: andps %xmm2, %xmm9
8970 ; SSE-NEXT: orps %xmm1, %xmm9
8971 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8972 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
8973 ; SSE-NEXT: movdqa %xmm4, %xmm1
8974 ; SSE-NEXT: pandn %xmm0, %xmm1
8975 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8976 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
8977 ; SSE-NEXT: pand %xmm4, %xmm0
8978 ; SSE-NEXT: por %xmm1, %xmm0
8979 ; SSE-NEXT: movdqa %xmm2, %xmm1
8980 ; SSE-NEXT: pandn %xmm0, %xmm1
8981 ; SSE-NEXT: movdqa %xmm13, %xmm7
8982 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8983 ; SSE-NEXT: # xmm7 = xmm7[1],mem[0]
8984 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8985 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
8986 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[2,1]
8987 ; SSE-NEXT: andps %xmm2, %xmm7
8988 ; SSE-NEXT: orps %xmm1, %xmm7
8989 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8990 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
8991 ; SSE-NEXT: movdqa %xmm4, %xmm1
8992 ; SSE-NEXT: pandn %xmm0, %xmm1
8993 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8994 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
8995 ; SSE-NEXT: pand %xmm4, %xmm0
8996 ; SSE-NEXT: por %xmm1, %xmm0
8997 ; SSE-NEXT: movdqa %xmm2, %xmm1
8998 ; SSE-NEXT: pandn %xmm0, %xmm1
8999 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9000 ; SSE-NEXT: movapd %xmm14, %xmm5
9001 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9002 ; SSE-NEXT: # xmm5 = xmm5[1],mem[0]
9003 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9004 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
9005 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm0[2,1]
9006 ; SSE-NEXT: andps %xmm2, %xmm5
9007 ; SSE-NEXT: orps %xmm1, %xmm5
9008 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9009 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
9010 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9011 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
9012 ; SSE-NEXT: pand %xmm4, %xmm0
9013 ; SSE-NEXT: pandn %xmm1, %xmm4
9014 ; SSE-NEXT: por %xmm0, %xmm4
9015 ; SSE-NEXT: shufpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9016 ; SSE-NEXT: # xmm3 = xmm3[1],mem[0]
9017 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9018 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
9019 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm0[2,1]
9020 ; SSE-NEXT: andps %xmm2, %xmm3
9021 ; SSE-NEXT: pandn %xmm4, %xmm2
9022 ; SSE-NEXT: por %xmm3, %xmm2
9023 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9024 ; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9025 ; SSE-NEXT: # xmm0 = xmm0[0],mem[0]
9026 ; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9027 ; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1]
9028 ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9029 ; SSE-NEXT: # xmm1 = mem[0,0,1,1]
9030 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm10[0,0,1,1]
9031 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
9032 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,0,0,65535]
9033 ; SSE-NEXT: movdqa %xmm4, %xmm10
9034 ; SSE-NEXT: pandn %xmm1, %xmm10
9035 ; SSE-NEXT: andps %xmm4, %xmm0
9036 ; SSE-NEXT: por %xmm0, %xmm10
9037 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9038 ; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9039 ; SSE-NEXT: # xmm0 = xmm0[0],mem[0]
9040 ; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9041 ; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1]
9042 ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9043 ; SSE-NEXT: # xmm1 = mem[0,0,1,1]
9044 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,0,1,1]
9045 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
9046 ; SSE-NEXT: movdqa %xmm4, %xmm8
9047 ; SSE-NEXT: pandn %xmm1, %xmm8
9048 ; SSE-NEXT: andps %xmm4, %xmm0
9049 ; SSE-NEXT: por %xmm0, %xmm8
9050 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9051 ; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9052 ; SSE-NEXT: # xmm0 = xmm0[0],mem[0]
9053 ; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9054 ; SSE-NEXT: # xmm0 = xmm0[2,0],mem[2,1]
9055 ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9056 ; SSE-NEXT: # xmm1 = mem[0,0,1,1]
9057 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,1,1]
9058 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
9059 ; SSE-NEXT: movdqa %xmm4, %xmm6
9060 ; SSE-NEXT: pandn %xmm1, %xmm6
9061 ; SSE-NEXT: andps %xmm4, %xmm0
9062 ; SSE-NEXT: por %xmm0, %xmm6
9063 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9064 ; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
9065 ; SSE-NEXT: # xmm13 = xmm13[0],mem[0]
9066 ; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
9067 ; SSE-NEXT: # xmm13 = xmm13[2,0],mem[2,1]
9068 ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9069 ; SSE-NEXT: # xmm0 = mem[0,0,1,1]
9070 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,1,1]
9071 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9072 ; SSE-NEXT: movdqa %xmm4, %xmm3
9073 ; SSE-NEXT: pandn %xmm0, %xmm3
9074 ; SSE-NEXT: andps %xmm4, %xmm13
9075 ; SSE-NEXT: por %xmm13, %xmm3
9076 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9077 ; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9078 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0]
9079 ; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9080 ; SSE-NEXT: # xmm1 = xmm1[2,0],mem[2,1]
9081 ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9082 ; SSE-NEXT: # xmm0 = mem[0,0,1,1]
9083 ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
9084 ; SSE-NEXT: # xmm13 = mem[0,0,1,1]
9085 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
9086 ; SSE-NEXT: movdqa %xmm4, %xmm13
9087 ; SSE-NEXT: pandn %xmm0, %xmm13
9088 ; SSE-NEXT: andps %xmm4, %xmm1
9089 ; SSE-NEXT: por %xmm1, %xmm13
9090 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9091 ; SSE-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9092 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0]
9093 ; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9094 ; SSE-NEXT: # xmm1 = xmm1[2,0],mem[2,1]
9095 ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9096 ; SSE-NEXT: # xmm0 = mem[0,0,1,1]
9097 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[0,0,1,1]
9098 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
9099 ; SSE-NEXT: movdqa %xmm4, %xmm15
9100 ; SSE-NEXT: pandn %xmm0, %xmm15
9101 ; SSE-NEXT: andps %xmm4, %xmm1
9102 ; SSE-NEXT: por %xmm1, %xmm15
9103 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9104 ; SSE-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
9105 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0]
9106 ; SSE-NEXT: shufps $98, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9107 ; SSE-NEXT: # xmm1 = xmm1[2,0],mem[2,1]
9108 ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9109 ; SSE-NEXT: # xmm0 = mem[0,0,1,1]
9110 ; SSE-NEXT: pshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9111 ; SSE-NEXT: # xmm14 = mem[0,0,1,1]
9112 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
9113 ; SSE-NEXT: andps %xmm4, %xmm1
9114 ; SSE-NEXT: pandn %xmm0, %xmm4
9115 ; SSE-NEXT: por %xmm1, %xmm4
9116 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
9117 ; SSE-NEXT: movdqa %xmm4, 672(%rax)
9118 ; SSE-NEXT: movdqa %xmm15, 560(%rax)
9119 ; SSE-NEXT: movdqa %xmm13, 448(%rax)
9120 ; SSE-NEXT: movdqa %xmm3, 336(%rax)
9121 ; SSE-NEXT: movdqa %xmm6, 224(%rax)
9122 ; SSE-NEXT: movdqa %xmm8, 112(%rax)
9123 ; SSE-NEXT: movdqa %xmm10, (%rax)
9124 ; SSE-NEXT: movdqa %xmm2, 736(%rax)
9125 ; SSE-NEXT: movaps %xmm5, 624(%rax)
9126 ; SSE-NEXT: movaps %xmm7, 512(%rax)
9127 ; SSE-NEXT: movaps %xmm9, 400(%rax)
9128 ; SSE-NEXT: movaps %xmm11, 288(%rax)
9129 ; SSE-NEXT: movaps %xmm12, 176(%rax)
9130 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9131 ; SSE-NEXT: movaps %xmm0, 64(%rax)
9132 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9133 ; SSE-NEXT: movaps %xmm0, 864(%rax)
9134 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9135 ; SSE-NEXT: movaps %xmm0, 784(%rax)
9136 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9137 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
9138 ; SSE-NEXT: movaps %xmm0, 752(%rax)
9139 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9140 ; SSE-NEXT: movaps %xmm0, 720(%rax)
9141 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9142 ; SSE-NEXT: movaps %xmm0, 704(%rax)
9143 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9144 ; SSE-NEXT: movaps %xmm0, 688(%rax)
9145 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9146 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
9147 ; SSE-NEXT: movaps %xmm0, 640(%rax)
9148 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9149 ; SSE-NEXT: movaps %xmm0, 608(%rax)
9150 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9151 ; SSE-NEXT: movaps %xmm0, 592(%rax)
9152 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9153 ; SSE-NEXT: movaps %xmm0, 576(%rax)
9154 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9155 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
9156 ; SSE-NEXT: movaps %xmm0, 528(%rax)
9157 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9158 ; SSE-NEXT: movaps %xmm0, 496(%rax)
9159 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9160 ; SSE-NEXT: movaps %xmm0, 480(%rax)
9161 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9162 ; SSE-NEXT: movaps %xmm0, 464(%rax)
9163 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9164 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
9165 ; SSE-NEXT: movaps %xmm0, 416(%rax)
9166 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9167 ; SSE-NEXT: movaps %xmm0, 384(%rax)
9168 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9169 ; SSE-NEXT: movaps %xmm0, 368(%rax)
9170 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9171 ; SSE-NEXT: movaps %xmm0, 352(%rax)
9172 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9173 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
9174 ; SSE-NEXT: movaps %xmm0, 304(%rax)
9175 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9176 ; SSE-NEXT: movaps %xmm0, 272(%rax)
9177 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9178 ; SSE-NEXT: movaps %xmm0, 256(%rax)
9179 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9180 ; SSE-NEXT: movaps %xmm0, 240(%rax)
9181 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9182 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
9183 ; SSE-NEXT: movaps %xmm0, 192(%rax)
9184 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9185 ; SSE-NEXT: movaps %xmm0, 160(%rax)
9186 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9187 ; SSE-NEXT: movaps %xmm0, 144(%rax)
9188 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9189 ; SSE-NEXT: movaps %xmm0, 128(%rax)
9190 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9191 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
9192 ; SSE-NEXT: movaps %xmm0, 80(%rax)
9193 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9194 ; SSE-NEXT: movaps %xmm0, 48(%rax)
9195 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9196 ; SSE-NEXT: movaps %xmm0, 32(%rax)
9197 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9198 ; SSE-NEXT: movaps %xmm0, 16(%rax)
9199 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9200 ; SSE-NEXT: movaps %xmm0, 656(%rax)
9201 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9202 ; SSE-NEXT: movaps %xmm0, 544(%rax)
9203 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9204 ; SSE-NEXT: movaps %xmm0, 432(%rax)
9205 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9206 ; SSE-NEXT: movaps %xmm0, 320(%rax)
9207 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9208 ; SSE-NEXT: movaps %xmm0, 208(%rax)
9209 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9210 ; SSE-NEXT: movaps %xmm0, 96(%rax)
9211 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9212 ; SSE-NEXT: movaps %xmm0, 880(%rax)
9213 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9214 ; SSE-NEXT: movaps %xmm0, 816(%rax)
9215 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9216 ; SSE-NEXT: movaps %xmm0, 768(%rax)
9217 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9218 ; SSE-NEXT: movaps %xmm0, 848(%rax)
9219 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9220 ; SSE-NEXT: movaps %xmm0, 832(%rax)
9221 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9222 ; SSE-NEXT: movaps %xmm0, 800(%rax)
; SSE-NEXT: addq $1640, %rsp # imm = 0x668
; SSE-NEXT: retq
;
; AVX-LABEL: store_i16_stride7_vf64:
9228 ; AVX-NEXT: subq $1496, %rsp # imm = 0x5D8
9229 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
9230 ; AVX-NEXT: vmovdqa 112(%rsi), %xmm1
9231 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm4
9232 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
9233 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,1,0,1]
9234 ; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
9235 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
9236 ; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
9237 ; AVX-NEXT: vandnps %ymm0, %ymm14, %ymm0
9238 ; AVX-NEXT: vmovdqa 112(%rdx), %xmm6
9239 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,2,2,2]
9240 ; AVX-NEXT: vmovdqa 112(%rcx), %xmm8
9241 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,3,3,3,4,5,6,7]
9242 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
9243 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6],xmm3[7]
9244 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
9245 ; AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9]
9246 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
9247 ; AVX-NEXT: vandps %ymm2, %ymm14, %ymm2
9248 ; AVX-NEXT: vorps %ymm0, %ymm2, %ymm7
9249 ; AVX-NEXT: vmovdqa 112(%r8), %xmm0
9250 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
9251 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm7[0],xmm2[1],xmm7[2,3,4,5,6,7]
9252 ; AVX-NEXT: vmovdqa 112(%r9), %xmm2
9253 ; AVX-NEXT: vmovdqa 112(%rax), %xmm3
9254 ; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7
9255 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,2,2,2]
9256 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0],xmm7[1,2,3,4,5,6],xmm11[7]
9257 ; AVX-NEXT: vpsrldq {{.*#+}} xmm11 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9258 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm11[1],xmm7[2,3,4,5,6,7]
9259 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[2,2,3,3]
9260 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm11[2],xmm7[3,4,5,6,7]
9261 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9262 ; AVX-NEXT: vpsrld $16, %xmm8, %xmm7
9263 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
9264 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
9265 ; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm7[0,1,2,3,4,5,6,6]
9266 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
9267 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11
9268 ; AVX-NEXT: vpsrld $16, %xmm1, %xmm12
9269 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm4[0],xmm12[0],xmm4[1],xmm12[1]
9270 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
9271 ; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7]
9272 ; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4]
9273 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12
9274 ; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
9275 ; AVX-NEXT: vandnps %ymm11, %ymm15, %ymm11
9276 ; AVX-NEXT: vandps %ymm15, %ymm12, %ymm12
9277 ; AVX-NEXT: vorps %ymm11, %ymm12, %ymm11
9278 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1]
9279 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm11[0,1,2],xmm12[3],xmm11[4,5,6,7]
9280 ; AVX-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
9281 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4,5],xmm12[6,7]
9282 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[0,1,0,1]
9283 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5],xmm12[6,7]
9284 ; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9285 ; AVX-NEXT: vextractf128 $1, %ymm11, %xmm11
9286 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm0[2,3],xmm11[4,5,6,7]
9287 ; AVX-NEXT: vpslld $16, %xmm2, %xmm12
9288 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5,6,7]
9289 ; AVX-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm3[0],zero,xmm3[1],zero
9290 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5,6,7]
9291 ; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9292 ; AVX-NEXT: vpsrlq $48, %xmm8, %xmm8
9293 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm6[1],xmm8[1]
9294 ; AVX-NEXT: vpsrld $16, %xmm2, %xmm11
9295 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7]
9296 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8
9297 ; AVX-NEXT: vandnps %ymm8, %ymm14, %ymm8
9298 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[2,2,3,3]
9299 ; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
9300 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5
9301 ; AVX-NEXT: vandps %ymm5, %ymm14, %ymm5
9302 ; AVX-NEXT: vorps %ymm5, %ymm8, %ymm5
9303 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,5],xmm0[6,7]
9304 ; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm2[0,1,2,3,5,6,6,7]
9305 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,2]
9306 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2,3,4,5,6],xmm9[7]
9307 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,3,2,3]
9308 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5,6,7]
9309 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9310 ; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5
9311 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,3,3]
9312 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm8[5],xmm5[6,7]
9313 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm11[6,7]
9314 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[3,3,3,3]
9315 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0],xmm5[1,2,3,4,5,6],xmm8[7]
9316 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9317 ; AVX-NEXT: vmovdqa 96(%rdx), %xmm8
9318 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9319 ; AVX-NEXT: vmovdqa 96(%rcx), %xmm5
9320 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9321 ; AVX-NEXT: vpsrlq $48, %xmm5, %xmm5
9322 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm8[1],xmm5[1]
9323 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,0,1,1]
9324 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5
9325 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
9326 ; AVX-NEXT: vmovdqa 96(%rsi), %xmm4
9327 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9328 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
9329 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
9330 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm7
9331 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9332 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
9333 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9334 ; AVX-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
9335 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
9336 ; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
9337 ; AVX-NEXT: vandnps %ymm5, %ymm7, %ymm4
9338 ; AVX-NEXT: vandps %ymm7, %ymm1, %ymm1
9339 ; AVX-NEXT: vorps %ymm4, %ymm1, %ymm1
9340 ; AVX-NEXT: vmovdqa 96(%r8), %xmm4
9341 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9342 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3]
9343 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4],xmm4[5],xmm1[6,7]
9344 ; AVX-NEXT: vmovdqa 96(%r9), %xmm5
9345 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9346 ; AVX-NEXT: vpsrld $16, %xmm5, %xmm5
9347 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm5[6,7]
9348 ; AVX-NEXT: vmovdqa 96(%rax), %xmm5
9349 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9350 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
9351 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6],xmm5[7]
9352 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9353 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
9354 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
9355 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
9356 ; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
9357 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
9358 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2],xmm3[3],xmm6[4,5,6,7]
9359 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9360 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,0,0,0]
9361 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
9362 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9363 ; AVX-NEXT: vmovdqa (%rdx), %xmm10
9364 ; AVX-NEXT: vmovdqa (%rcx), %xmm9
9365 ; AVX-NEXT: vpsrld $16, %xmm9, %xmm0
9366 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
9367 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
9368 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9369 ; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9370 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
9371 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3
9372 ; AVX-NEXT: vmovdqa (%rsi), %xmm1
9373 ; AVX-NEXT: vmovdqa (%rdi), %xmm2
9374 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
9375 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,2,4,5,6,7]
9376 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
9377 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
9378 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7]
9379 ; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4]
9380 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
9381 ; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
9382 ; AVX-NEXT: vandnps %ymm3, %ymm11, %ymm3
9383 ; AVX-NEXT: vandps %ymm4, %ymm11, %ymm4
9384 ; AVX-NEXT: vorps %ymm3, %ymm4, %ymm5
9385 ; AVX-NEXT: vmovdqa (%r9), %xmm4
9386 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9387 ; AVX-NEXT: vmovdqa (%r8), %xmm3
9388 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9389 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
9390 ; AVX-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
9391 ; AVX-NEXT: vmovdqa (%rax), %xmm4
9392 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,1,0,1]
9393 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5],xmm6[6,7]
9394 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,1,0,1]
9395 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,0,0,0]
9396 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7]
9397 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
9398 ; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
9399 ; AVX-NEXT: vandps %ymm7, %ymm5, %ymm5
9400 ; AVX-NEXT: vandnps %ymm6, %ymm7, %ymm6
9401 ; AVX-NEXT: vorps %ymm6, %ymm5, %ymm5
9402 ; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9403 ; AVX-NEXT: vpsrld $16, %xmm1, %xmm5
9404 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
9405 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
9406 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9407 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
9408 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
9409 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[3,3,3,3,4,5,6,7]
9410 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
9411 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[2,2,2,2]
9412 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm5[6],xmm2[7]
9413 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6]
9414 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
9415 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
9416 ; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
9417 ; AVX-NEXT: vandnps %ymm1, %ymm12, %ymm1
9418 ; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0
9419 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
9420 ; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
9421 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9422 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7]
9423 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,2],xmm4[1,3]
9424 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
9425 ; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
9426 ; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0
9427 ; AVX-NEXT: vandnps %ymm1, %ymm4, %ymm1
9428 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
9429 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9430 ; AVX-NEXT: vmovdqa 16(%rdx), %xmm5
9431 ; AVX-NEXT: vmovdqa 16(%rcx), %xmm6
9432 ; AVX-NEXT: vpsrld $16, %xmm6, %xmm0
9433 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
9434 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
9435 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9436 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,6]
9437 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
9438 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
9439 ; AVX-NEXT: vmovdqa 16(%rsi), %xmm10
9440 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm9
9441 ; AVX-NEXT: vpsrld $16, %xmm10, %xmm1
9442 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
9443 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
9444 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9445 ; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9446 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
9447 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4]
9448 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
9449 ; AVX-NEXT: vandnps %ymm0, %ymm15, %ymm0
9450 ; AVX-NEXT: vandps %ymm1, %ymm15, %ymm1
9451 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
9452 ; AVX-NEXT: vmovdqa 16(%r9), %xmm8
9453 ; AVX-NEXT: vmovdqa 16(%r8), %xmm7
9454 ; AVX-NEXT: vmovdqa 16(%rax), %xmm3
9455 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
9456 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9457 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9458 ; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm13[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
9459 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1]
9460 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
9461 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm13[0,2],xmm3[1,3]
9462 ; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9463 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
9464 ; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
9465 ; AVX-NEXT: vandps %ymm0, %ymm14, %ymm0
9466 ; AVX-NEXT: vandnps %ymm1, %ymm14, %ymm1
9467 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
9468 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9469 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
9470 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9471 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
9472 ; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
9473 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
9474 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9475 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[3,3,3,3,4,5,6,7]
9476 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
9477 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9478 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,2,2,2]
9479 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6],xmm1[7]
9480 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
9481 ; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9482 ; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9]
9483 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
9484 ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
9485 ; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0
9486 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
9487 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
9488 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
9489 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
9490 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
9491 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9492 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,3,3]
9493 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
9494 ; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
9495 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7]
9496 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
9497 ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
9498 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
9499 ; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1
9500 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
9501 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9502 ; AVX-NEXT: vmovdqa 32(%rdx), %xmm15
9503 ; AVX-NEXT: vmovdqa 32(%rcx), %xmm13
9504 ; AVX-NEXT: vpsrld $16, %xmm13, %xmm0
9505 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
9506 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
9507 ; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9508 ; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9509 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
9510 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
9511 ; AVX-NEXT: vmovdqa 32(%rsi), %xmm2
9512 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm7
9513 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
9514 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,2,4,5,6,7]
9515 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
9516 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
9517 ; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,2,2,2,4,5,6,7]
9518 ; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
9519 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5
9520 ; AVX-NEXT: vandnps %ymm1, %ymm11, %ymm1
9521 ; AVX-NEXT: vandps %ymm5, %ymm11, %ymm5
9522 ; AVX-NEXT: vorps %ymm1, %ymm5, %ymm8
9523 ; AVX-NEXT: vmovdqa 32(%r9), %xmm6
9524 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9525 ; AVX-NEXT: vmovdqa 32(%r8), %xmm1
9526 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9527 ; AVX-NEXT: vmovdqa 32(%rax), %xmm5
9528 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
9529 ; AVX-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
9530 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[0,1,0,1]
9531 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5],xmm9[6,7]
9532 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,1,0,1]
9533 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,0,0,0]
9534 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm11[6,7]
9535 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9
9536 ; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
9537 ; AVX-NEXT: vandps %ymm3, %ymm8, %ymm8
9538 ; AVX-NEXT: vandnps %ymm9, %ymm3, %ymm9
9539 ; AVX-NEXT: vorps %ymm9, %ymm8, %ymm6
9540 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9541 ; AVX-NEXT: vpsrld $16, %xmm2, %xmm8
9542 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
9543 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
9544 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9545 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
9546 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm2
9547 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[3,3,3,3,4,5,6,7]
9548 ; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
9549 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[2,2,2,2]
9550 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6],xmm7[7]
9551 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6]
9552 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
9553 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
9554 ; AVX-NEXT: vandnps %ymm2, %ymm12, %ymm2
9555 ; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0
9556 ; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0
9557 ; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
9558 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9559 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3],xmm2[4,5,6,7]
9560 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[1,3]
9561 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
9562 ; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0
9563 ; AVX-NEXT: vandnps %ymm1, %ymm4, %ymm1
9564 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
9565 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9566 ; AVX-NEXT: vmovdqa 48(%rdx), %xmm6
9567 ; AVX-NEXT: vmovdqa 48(%rcx), %xmm8
9568 ; AVX-NEXT: vpsrld $16, %xmm8, %xmm0
9569 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
9570 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
9571 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9572 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,6]
9573 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
9574 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
9575 ; AVX-NEXT: vmovdqa 48(%rsi), %xmm3
9576 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm13
9577 ; AVX-NEXT: vpsrld $16, %xmm3, %xmm1
9578 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
9579 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
9580 ; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9581 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9582 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
9583 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4]
9584 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
9585 ; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
9586 ; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0
9587 ; AVX-NEXT: vandps %ymm4, %ymm1, %ymm1
9588 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
9589 ; AVX-NEXT: vmovdqa 48(%r9), %xmm10
9590 ; AVX-NEXT: vmovdqa 48(%r8), %xmm9
9591 ; AVX-NEXT: vmovdqa 48(%rax), %xmm11
9592 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
9593 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9594 ; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9595 ; AVX-NEXT: vmovdqa %xmm7, %xmm2
9596 ; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
9597 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,1,0,1]
9598 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm7[5],xmm1[6,7]
9599 ; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm2[0,2],xmm11[1,3]
9600 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9601 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1
9602 ; AVX-NEXT: vandps %ymm0, %ymm14, %ymm0
9603 ; AVX-NEXT: vandnps %ymm1, %ymm14, %ymm1
9604 ; AVX-NEXT: vmovaps %ymm14, %ymm5
9605 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
9606 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9607 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
9608 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9609 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
9610 ; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
9611 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
9612 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9613 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[3,3,3,3,4,5,6,7]
9614 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
9615 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9616 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,2,2,2]
9617 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm7[6],xmm1[7]
9618 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
9619 ; AVX-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9620 ; AVX-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9]
9621 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1
9622 ; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
9623 ; AVX-NEXT: vandnps %ymm0, %ymm3, %ymm0
9624 ; AVX-NEXT: vandps %ymm3, %ymm1, %ymm1
9625 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
9626 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
9627 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
9628 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
9629 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[2,2,3,3]
9630 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5,6,7]
9631 ; AVX-NEXT: vpsrldq {{.*#+}} xmm7 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
9632 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3],xmm7[4,5,6,7]
9633 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1
9634 ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
9635 ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
9636 ; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1
9637 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
9638 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9639 ; AVX-NEXT: vmovdqa 64(%rdx), %xmm2
9640 ; AVX-NEXT: vmovdqa 64(%rcx), %xmm7
9641 ; AVX-NEXT: vpsrld $16, %xmm7, %xmm0
9642 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
9643 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
9644 ; AVX-NEXT: vmovdqa %xmm7, %xmm0
9645 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9646 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9647 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,0,1,1]
9648 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1
9649 ; AVX-NEXT: vmovdqa 64(%rsi), %xmm8
9650 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm9
9651 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
9652 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7]
9653 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
9654 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
9655 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7]
9656 ; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4]
9657 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7
9658 ; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
9659 ; AVX-NEXT: vandnps %ymm1, %ymm6, %ymm1
9660 ; AVX-NEXT: vandps %ymm6, %ymm7, %ymm7
9661 ; AVX-NEXT: vorps %ymm1, %ymm7, %ymm10
9662 ; AVX-NEXT: vmovdqa 64(%r9), %xmm1
9663 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9664 ; AVX-NEXT: vmovdqa 64(%r8), %xmm6
9665 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9666 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
9667 ; AVX-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
9668 ; AVX-NEXT: vmovdqa 64(%rax), %xmm6
9669 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[0,1,0,1]
9670 ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm14[5],xmm13[6,7]
9671 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,1,0,1]
9672 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,0,0,0]
9673 ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7]
9674 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13
9675 ; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
9676 ; AVX-NEXT: vandps %ymm1, %ymm10, %ymm10
9677 ; AVX-NEXT: vandnps %ymm13, %ymm1, %ymm13
9678 ; AVX-NEXT: vorps %ymm13, %ymm10, %ymm1
9679 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9680 ; AVX-NEXT: vpsrld $16, %xmm8, %xmm10
9681 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
9682 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
9683 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9684 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,1,0,1]
9685 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8
9686 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[3,3,3,3,4,5,6,7]
9687 ; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4]
9688 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,2,2,2]
9689 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm10[6],xmm9[7]
9690 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,6,6]
9691 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
9692 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0
9693 ; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
9694 ; AVX-NEXT: vandnps %ymm8, %ymm1, %ymm8
9695 ; AVX-NEXT: vandps %ymm1, %ymm0, %ymm0
9696 ; AVX-NEXT: vorps %ymm0, %ymm8, %ymm0
9697 ; AVX-NEXT: vpsrldq {{.*#+}} xmm8 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
9698 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9699 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[3],xmm8[4,5,6,7]
9700 ; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[1,3]
9701 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7
9702 ; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
9703 ; AVX-NEXT: vandps %ymm1, %ymm0, %ymm0
9704 ; AVX-NEXT: vandnps %ymm7, %ymm1, %ymm7
9705 ; AVX-NEXT: vorps %ymm7, %ymm0, %ymm0
9706 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9707 ; AVX-NEXT: vmovdqa 80(%rdx), %xmm1
9708 ; AVX-NEXT: vmovdqa 80(%rcx), %xmm9
9709 ; AVX-NEXT: vpsrld $16, %xmm9, %xmm0
9710 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9711 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3]
9712 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9713 ; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,5,6,6]
9714 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
9715 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
9716 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm8
9717 ; AVX-NEXT: vmovdqa 80(%rsi), %xmm12
9718 ; AVX-NEXT: vpsrld $16, %xmm12, %xmm7
9719 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
9720 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3]
9721 ; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9722 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9723 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7]
9724 ; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4]
9725 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7
9726 ; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0
9727 ; AVX-NEXT: vandps %ymm4, %ymm7, %ymm7
9728 ; AVX-NEXT: vorps %ymm0, %ymm7, %ymm7
9729 ; AVX-NEXT: vmovdqa 80(%r9), %xmm4
9730 ; AVX-NEXT: vmovdqa 80(%r8), %xmm3
9731 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
9732 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9733 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9734 ; AVX-NEXT: vmovdqa 80(%rax), %xmm2
9735 ; AVX-NEXT: vmovdqa %xmm6, %xmm0
9736 ; AVX-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
9737 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,0,1]
9738 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2,3,4],xmm6[5],xmm15[6,7]
9739 ; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm0[0,2],xmm2[1,3]
9740 ; AVX-NEXT: vmovdqa %xmm0, %xmm10
9741 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9742 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm6, %ymm6
9743 ; AVX-NEXT: vandps %ymm5, %ymm7, %ymm7
9744 ; AVX-NEXT: vandnps %ymm6, %ymm5, %ymm6
9745 ; AVX-NEXT: vorps %ymm6, %ymm7, %ymm0
9746 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9747 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
9748 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9749 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,0,1]
9750 ; AVX-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
9751 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
9752 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[3,3,3,3,4,5,6,7]
9753 ; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
9754 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9755 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[2,2,2,2]
9756 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm15[6],xmm7[7]
9757 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
9758 ; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9759 ; AVX-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
9760 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7
9761 ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
9762 ; AVX-NEXT: vandnps %ymm6, %ymm0, %ymm6
9763 ; AVX-NEXT: vandps %ymm0, %ymm7, %ymm7
9764 ; AVX-NEXT: vorps %ymm6, %ymm7, %ymm6
9765 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
9766 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7]
9767 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
9768 ; AVX-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
9769 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[2,2,3,3]
9770 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm15[2,3],xmm7[4,5,6,7]
9771 ; AVX-NEXT: vpsrldq {{.*#+}} xmm15 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
9772 ; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm2[3],xmm15[4,5,6,7]
9773 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm15, %ymm7
9774 ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
9775 ; AVX-NEXT: vandps %ymm0, %ymm6, %ymm6
9776 ; AVX-NEXT: vandnps %ymm7, %ymm0, %ymm7
9777 ; AVX-NEXT: vorps %ymm7, %ymm6, %ymm0
9778 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9779 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9780 ; AVX-NEXT: vpsrld $16, %xmm8, %xmm6
9781 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9782 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
9783 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
9784 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,0,1,1]
9785 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm15, %ymm7
9786 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9787 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9788 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
9789 ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,2,2,4,5,6,7]
9790 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,1]
9791 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
9792 ; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7]
9793 ; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,4]
9794 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14
9795 ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
9796 ; AVX-NEXT: vandnps %ymm7, %ymm0, %ymm7
9797 ; AVX-NEXT: vandps %ymm0, %ymm14, %ymm14
9798 ; AVX-NEXT: vorps %ymm7, %ymm14, %ymm14
9799 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9800 ; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
9801 ; AVX-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3]
9802 ; AVX-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
9803 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9804 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,1,0,1]
9805 ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2,3,4],xmm13[5],xmm15[6,7]
9806 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,1,0,1]
9807 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,0,0]
9808 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1,2,3,4,5],xmm12[6,7]
9809 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12
9810 ; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
9811 ; AVX-NEXT: vandps %ymm3, %ymm14, %ymm13
9812 ; AVX-NEXT: vandnps %ymm12, %ymm3, %ymm12
9813 ; AVX-NEXT: vorps %ymm12, %ymm13, %ymm3
9814 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9815 ; AVX-NEXT: vpsrld $16, %xmm1, %xmm12
9816 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm4[0],xmm12[0],xmm4[1],xmm12[1]
9817 ; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
9818 ; AVX-NEXT: # xmm13 = mem[0,1,0,1]
9819 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12
9820 ; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[3,3,3,3,4,5,6,7]
9821 ; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4]
9822 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,2,2,2]
9823 ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6],xmm13[7]
9824 ; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,6]
9825 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
9826 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6
9827 ; AVX-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
9828 ; AVX-NEXT: vandnps %ymm12, %ymm10, %ymm12
9829 ; AVX-NEXT: vandps %ymm6, %ymm10, %ymm6
9830 ; AVX-NEXT: vorps %ymm6, %ymm12, %ymm6
9831 ; AVX-NEXT: vpsrldq {{.*#+}} xmm12 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
9832 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm0[3],xmm12[4,5,6,7]
9833 ; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[1,3]
9834 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7
9835 ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
9836 ; AVX-NEXT: vandps %ymm0, %ymm6, %ymm6
9837 ; AVX-NEXT: vandnps %ymm7, %ymm0, %ymm7
9838 ; AVX-NEXT: vorps %ymm7, %ymm6, %ymm0
9839 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9840 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9841 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9842 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
9843 ; AVX-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9]
9844 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
9845 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9846 ; AVX-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
9847 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[2,2,3,3]
9848 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7
9849 ; AVX-NEXT: vandnps %ymm6, %ymm10, %ymm6
9850 ; AVX-NEXT: vandps %ymm7, %ymm10, %ymm7
9851 ; AVX-NEXT: vorps %ymm6, %ymm7, %ymm6
9852 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9853 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9854 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
9855 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7]
9856 ; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4]
9857 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9858 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,3,2,3]
9859 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2,3,4,5,6,7]
9860 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
9861 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,2,4,5,6,7]
9862 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1]
9863 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[2,2,3,3]
9864 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7]
9865 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm12, %ymm12
9866 ; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
9867 ; AVX-NEXT: vandnps %ymm6, %ymm5, %ymm6
9868 ; AVX-NEXT: vandps %ymm5, %ymm12, %ymm12
9869 ; AVX-NEXT: vorps %ymm6, %ymm12, %ymm4
9870 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9871 ; AVX-NEXT: vpsrlq $48, %xmm15, %xmm6
9872 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm14[1],xmm6[1]
9873 ; AVX-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
9874 ; AVX-NEXT: # xmm12 = mem[0,0,1,1]
9875 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6
9876 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9877 ; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm12 # 16-byte Folded Reload
9878 ; AVX-NEXT: # xmm12 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3]
9879 ; AVX-NEXT: vpsrldq {{.*#+}} xmm13 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
9880 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,2,4,5,6,7]
9881 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1]
9882 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12
9883 ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
9884 ; AVX-NEXT: vandnps %ymm6, %ymm2, %ymm6
9885 ; AVX-NEXT: vandps %ymm2, %ymm12, %ymm12
9886 ; AVX-NEXT: vorps %ymm6, %ymm12, %ymm12
9887 ; AVX-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9888 ; AVX-NEXT: # xmm6 = mem[0,1,0,1]
9889 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9890 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,0,0,0]
9891 ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1,2,3,4,5],xmm13[6,7]
9892 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm6
9893 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
9894 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
9895 ; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13]
9896 ; AVX-NEXT: vpshufb %xmm6, %xmm4, %xmm4
9897 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm13
9898 ; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
9899 ; AVX-NEXT: vandps %ymm8, %ymm12, %ymm12
9900 ; AVX-NEXT: vandnps %ymm13, %ymm8, %ymm13
9901 ; AVX-NEXT: vorps %ymm13, %ymm12, %ymm0
9902 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9903 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9904 ; AVX-NEXT: vpsrlq $48, %xmm0, %xmm12
9905 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9906 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm0[1],xmm12[1]
9907 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9908 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
9909 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9910 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,2,3,3]
9911 ; AVX-NEXT: vpsrldq {{.*#+}} xmm14 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
9912 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13
9913 ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
9914 ; AVX-NEXT: vandnps %ymm12, %ymm0, %ymm12
9915 ; AVX-NEXT: vandps %ymm0, %ymm13, %ymm13
9916 ; AVX-NEXT: vorps %ymm12, %ymm13, %ymm12
9917 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9918 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm13
9919 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9920 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
9921 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
9922 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7]
9923 ; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7]
9924 ; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,4]
9925 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
9926 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0],xmm3[1],xmm14[2,3,4,5,6,7]
9927 ; AVX-NEXT: vpshufb %xmm6, %xmm13, %xmm13
9928 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm13
9929 ; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
9930 ; AVX-NEXT: vandps %ymm7, %ymm12, %ymm12
9931 ; AVX-NEXT: vandnps %ymm13, %ymm7, %ymm13
9932 ; AVX-NEXT: vorps %ymm13, %ymm12, %ymm0
9933 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9934 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9935 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9936 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
9937 ; AVX-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9]
9938 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12
9939 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9940 ; AVX-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
9941 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,2,3,3]
9942 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13
9943 ; AVX-NEXT: vandnps %ymm12, %ymm10, %ymm12
9944 ; AVX-NEXT: vandps %ymm10, %ymm13, %ymm13
9945 ; AVX-NEXT: vorps %ymm12, %ymm13, %ymm12
9946 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9947 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9948 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
9949 ; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7]
9950 ; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4]
9951 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9952 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[2,3,2,3]
9953 ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6,7]
9954 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
9955 ; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,2,2,4,5,6,7]
9956 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
9957 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[2,2,3,3]
9958 ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3],xmm14[4,5,6,7]
9959 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13
9960 ; AVX-NEXT: vandnps %ymm12, %ymm5, %ymm12
9961 ; AVX-NEXT: vandps %ymm5, %ymm13, %ymm13
9962 ; AVX-NEXT: vorps %ymm12, %ymm13, %ymm5
9963 ; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9964 ; AVX-NEXT: vpsrlq $48, %xmm2, %xmm12
9965 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm3[1],xmm12[1]
9966 ; AVX-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
9967 ; AVX-NEXT: # xmm13 = mem[0,0,1,1]
9968 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12
9969 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9970 ; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload
9971 ; AVX-NEXT: # xmm13 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
9972 ; AVX-NEXT: vpsrldq {{.*#+}} xmm14 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
9973 ; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7]
9974 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
9975 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13
9976 ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
9977 ; AVX-NEXT: vandnps %ymm12, %ymm2, %ymm12
9978 ; AVX-NEXT: vandps %ymm2, %ymm13, %ymm13
9979 ; AVX-NEXT: vorps %ymm12, %ymm13, %ymm12
9980 ; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
9981 ; AVX-NEXT: # xmm13 = mem[0,1,0,1]
9982 ; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm11[0,0,0,0]
9983 ; AVX-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3]
9984 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm14
9985 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm1[2],xmm14[2],xmm1[3],xmm14[3]
9986 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7]
9987 ; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13]
9988 ; AVX-NEXT: vpshufb %xmm15, %xmm5, %xmm5
9989 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5
9990 ; AVX-NEXT: vandps %ymm8, %ymm12, %ymm12
9991 ; AVX-NEXT: vandnps %ymm5, %ymm8, %ymm5
9992 ; AVX-NEXT: vorps %ymm5, %ymm12, %ymm0
9993 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9994 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9995 ; AVX-NEXT: vpsrlq $48, %xmm0, %xmm5
9996 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9997 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm0[1],xmm5[1]
9998 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
9999 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
10000 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10001 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,2,3,3]
10002 ; AVX-NEXT: vpsrldq {{.*#+}} xmm13 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
10003 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12
10004 ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
10005 ; AVX-NEXT: vandnps %ymm5, %ymm0, %ymm5
10006 ; AVX-NEXT: vandps %ymm0, %ymm12, %ymm12
10007 ; AVX-NEXT: vorps %ymm5, %ymm12, %ymm5
10008 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10009 ; AVX-NEXT: vpsrld $16, %xmm1, %xmm12
10010 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10011 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
10012 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
10013 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
10014 ; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7]
10015 ; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4]
10016 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[2,3,2,3]
10017 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0],xmm2[1],xmm13[2,3,4,5,6,7]
10018 ; AVX-NEXT: vpshufb %xmm15, %xmm12, %xmm12
10019 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm2, %ymm2
10020 ; AVX-NEXT: vandps %ymm7, %ymm5, %ymm5
10021 ; AVX-NEXT: vandnps %ymm2, %ymm7, %ymm2
10022 ; AVX-NEXT: vorps %ymm2, %ymm5, %ymm0
10023 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10024 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10025 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10026 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
10027 ; AVX-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9]
10028 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm2
10029 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10030 ; AVX-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
10031 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[2,2,3,3]
10032 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12
10033 ; AVX-NEXT: vandnps %ymm2, %ymm10, %ymm2
10034 ; AVX-NEXT: vandps %ymm10, %ymm12, %ymm12
10035 ; AVX-NEXT: vorps %ymm2, %ymm12, %ymm2
10036 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10037 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10038 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
10039 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7]
10040 ; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
10041 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10042 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3]
10043 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7]
10044 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
10045 ; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7]
10046 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
10047 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3]
10048 ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6,7]
10049 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12
10050 ; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
10051 ; AVX-NEXT: vandnps %ymm2, %ymm14, %ymm2
10052 ; AVX-NEXT: vandps %ymm14, %ymm12, %ymm12
10053 ; AVX-NEXT: vorps %ymm2, %ymm12, %ymm2
10054 ; AVX-NEXT: vpsrlq $48, %xmm4, %xmm11
10055 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm3[1],xmm11[1]
10056 ; AVX-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
10057 ; AVX-NEXT: # xmm12 = mem[0,0,1,1]
10058 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11
10059 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10060 ; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm12 # 16-byte Folded Reload
10061 ; AVX-NEXT: # xmm12 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
10062 ; AVX-NEXT: vpsrldq {{.*#+}} xmm13 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
10063 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,2,4,5,6,7]
10064 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1]
10065 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12
10066 ; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
10067 ; AVX-NEXT: vandnps %ymm11, %ymm3, %ymm11
10068 ; AVX-NEXT: vandps %ymm3, %ymm12, %ymm12
10069 ; AVX-NEXT: vorps %ymm11, %ymm12, %ymm11
10070 ; AVX-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
10071 ; AVX-NEXT: # xmm10 = mem[0,1,0,1]
10072 ; AVX-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload
10073 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,0,0,0]
10074 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm12[6,7]
10075 ; AVX-NEXT: vpsrld $16, %xmm1, %xmm12
10076 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm6[2],xmm12[2],xmm6[3],xmm12[3]
10077 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
10078 ; AVX-NEXT: vpshufb %xmm15, %xmm1, %xmm1
10079 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1
10080 ; AVX-NEXT: vandps %ymm8, %ymm11, %ymm10
10081 ; AVX-NEXT: vandnps %ymm1, %ymm8, %ymm1
10082 ; AVX-NEXT: vorps %ymm1, %ymm10, %ymm1
10083 ; AVX-NEXT: vpsrlq $48, %xmm9, %xmm4
10084 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10085 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm0[1],xmm4[1]
10086 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10087 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
10088 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10089 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,3,3]
10090 ; AVX-NEXT: vpsrldq {{.*#+}} xmm9 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
10091 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8
10092 ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
10093 ; AVX-NEXT: vandnps %ymm4, %ymm0, %ymm4
10094 ; AVX-NEXT: vandps %ymm0, %ymm8, %ymm8
10095 ; AVX-NEXT: vorps %ymm4, %ymm8, %ymm4
10096 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10097 ; AVX-NEXT: vpsrld $16, %xmm3, %xmm8
10098 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10099 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm0[2],xmm8[2],xmm0[3],xmm8[3]
10100 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
10101 ; AVX-NEXT: vpshufb %xmm15, %xmm8, %xmm6
10102 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
10103 ; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,2,2,2,4,5,6,7]
10104 ; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
10105 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
10106 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3,4,5,6,7]
10107 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
10108 ; AVX-NEXT: vandps %ymm7, %ymm4, %ymm4
10109 ; AVX-NEXT: vandnps %ymm0, %ymm7, %ymm0
10110 ; AVX-NEXT: vorps %ymm0, %ymm4, %ymm0
10111 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10112 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
10113 ; AVX-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
10114 ; AVX-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9]
10115 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
10116 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10117 ; AVX-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
10118 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,3,3]
10119 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
10120 ; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
10121 ; AVX-NEXT: vandnps %ymm3, %ymm5, %ymm3
10122 ; AVX-NEXT: vandps %ymm5, %ymm4, %ymm4
10123 ; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3
10124 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10125 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10126 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
10127 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,2,2,2,4,5,6,7]
10128 ; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4]
10129 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10130 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
10131 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3,4,5,6,7]
10132 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
10133 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,2,4,5,6,7]
10134 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
10135 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,3,3]
10136 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6,7]
10137 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4
10138 ; AVX-NEXT: vandnps %ymm3, %ymm14, %ymm3
10139 ; AVX-NEXT: vandps %ymm4, %ymm14, %ymm4
; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovaps %ymm3, 736(%rax)
; AVX-NEXT: vmovaps %ymm0, 640(%rax)
; AVX-NEXT: vmovaps %ymm1, 544(%rax)
; AVX-NEXT: vmovaps %ymm2, 512(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 416(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 320(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 288(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 192(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 96(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 64(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 704(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 672(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 608(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 576(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 480(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 448(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 384(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 352(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 256(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 224(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 160(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 128(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 32(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, (%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 784(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 768(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 880(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 864(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 816(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 800(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 848(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 832(%rax)
; AVX-NEXT: addq $1496, %rsp # imm = 0x5D8
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i16_stride7_vf64:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $1688, %rsp # imm = 0x698
10209 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
10210 ; AVX2-NEXT: vmovdqa 64(%r8), %ymm4
10211 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10212 ; AVX2-NEXT: vmovdqa 32(%r8), %ymm3
10213 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10214 ; AVX2-NEXT: vmovdqa (%r8), %ymm0
10215 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10216 ; AVX2-NEXT: vmovdqa 64(%r9), %ymm5
10217 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10218 ; AVX2-NEXT: vmovdqa 32(%r9), %ymm8
10219 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10220 ; AVX2-NEXT: vmovdqa (%r9), %ymm1
10221 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10222 ; AVX2-NEXT: vmovdqa (%rax), %ymm9
10223 ; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10224 ; AVX2-NEXT: vmovdqa 32(%rax), %ymm7
10225 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10226 ; AVX2-NEXT: vmovdqa 64(%rax), %ymm6
10227 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10228 ; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,3,0,4]
10229 ; AVX2-NEXT: vpermd %ymm0, %ymm11, %ymm0
10230 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7]
10231 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15]
10232 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0]
10233 ; AVX2-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
10234 ; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm10 = [3,0,0,3,0,0,0,4]
10235 ; AVX2-NEXT: vpermd %ymm3, %ymm11, %ymm2
10236 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7]
10237 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15]
10238 ; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2
10239 ; AVX2-NEXT: vpermd %ymm4, %ymm11, %ymm3
10240 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,0,3,4,5,4,7]
10241 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15]
10242 ; AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3
10243 ; AVX2-NEXT: vpermd %ymm9, %ymm10, %ymm1
10244 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0]
10245 ; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm5
10246 ; AVX2-NEXT: vpermd %ymm7, %ymm10, %ymm0
10247 ; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm1
10248 ; AVX2-NEXT: vpermd %ymm6, %ymm10, %ymm0
10249 ; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
10250 ; AVX2-NEXT: vmovdqa (%rsi), %ymm2
10251 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10252 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,3,2,3,4,7,6,7]
10253 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
10254 ; AVX2-NEXT: vmovdqa (%rdi), %ymm3
10255 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10256 ; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,0,0,0,4,0,0,4]
10257 ; AVX2-NEXT: vpermd %ymm3, %ymm7, %ymm3
10258 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535]
10259 ; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2
10260 ; AVX2-NEXT: vmovdqa (%rcx), %ymm3
10261 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10262 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
10263 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
10264 ; AVX2-NEXT: vmovdqa (%rdx), %ymm4
10265 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10266 ; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,0,0,0,4,0,0]
10267 ; AVX2-NEXT: vpermd %ymm4, %ymm9, %ymm4
10268 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0]
10269 ; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3
10270 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535]
10271 ; AVX2-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2
10272 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
10273 ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm5, %ymm2
10274 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10275 ; AVX2-NEXT: vmovdqa 32(%rsi), %ymm2
10276 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10277 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,3,2,3,4,7,6,7]
10278 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
10279 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4
10280 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10281 ; AVX2-NEXT: vpermd %ymm4, %ymm7, %ymm4
10282 ; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm2
10283 ; AVX2-NEXT: vmovdqa 32(%rcx), %ymm4
10284 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10285 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
10286 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
10287 ; AVX2-NEXT: vmovdqa 32(%rdx), %ymm5
10288 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10289 ; AVX2-NEXT: vpermd %ymm5, %ymm9, %ymm5
10290 ; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm4
10291 ; AVX2-NEXT: vpblendvb %ymm10, %ymm2, %ymm4, %ymm2
10292 ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
10293 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10294 ; AVX2-NEXT: vmovdqa 64(%rsi), %ymm1
10295 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10296 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,3,2,3,4,7,6,7]
10297 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
10298 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm2
10299 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10300 ; AVX2-NEXT: vpermd %ymm2, %ymm7, %ymm2
10301 ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm1
10302 ; AVX2-NEXT: vmovdqa 64(%rcx), %ymm2
10303 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10304 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
10305 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
10306 ; AVX2-NEXT: vmovdqa 64(%rdx), %ymm4
10307 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10308 ; AVX2-NEXT: vpermd %ymm4, %ymm9, %ymm4
10309 ; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm2, %ymm2
10310 ; AVX2-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1
10311 ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0
10312 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10313 ; AVX2-NEXT: vmovdqa 96(%rsi), %ymm3
10314 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
10315 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10316 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
10317 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm5
10318 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[1,1,1,1,5,5,5,5]
10319 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
10320 ; AVX2-NEXT: vmovdqa 96(%rcx), %ymm4
10321 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
10322 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
10323 ; AVX2-NEXT: vmovdqa 96(%rdx), %ymm6
10324 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[0,1,1,3,4,5,5,7]
10325 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
10326 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
10327 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
10328 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
10329 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
10330 ; AVX2-NEXT: vmovdqa 96(%r8), %ymm7
10331 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[0,0,2,1,4,4,6,5]
10332 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10333 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
10334 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
10335 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
10336 ; AVX2-NEXT: vmovdqa 96(%r9), %ymm8
10337 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15]
10338 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
10339 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
10340 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
10341 ; AVX2-NEXT: vmovdqa 96(%rax), %ymm9
10342 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[0,1,1,3,4,5,5,7]
10343 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
10344 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
10345 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
10346 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10347 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
10348 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10349 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10350 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6]
10351 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
10352 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
10353 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
10354 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10355 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[2,2,2,2,6,6,6,6]
10356 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
10357 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
10358 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
10359 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
10360 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
10361 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[1,2,2,3,5,6,6,7]
10362 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
10363 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
10364 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
10365 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10366 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,1,2,3,6,5,6,7]
10367 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
10368 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
10369 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
10370 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
10371 ; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10372 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[0,1,2,2,4,5,6,6]
10373 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
10374 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
10375 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
10376 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10377 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
10378 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
10379 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7]
10380 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
10381 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
10382 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
10383 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7]
10384 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
10385 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
10386 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
10387 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
10388 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
10389 ; AVX2-NEXT: vpbroadcastd 124(%r8), %ymm1
10390 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
10391 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
10392 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
10393 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
10394 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
10395 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
10396 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,3,3,3,6,7,7,7]
10397 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
10398 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
10399 ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
10400 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10401 ; AVX2-NEXT: vmovdqa (%rsi), %xmm1
10402 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10403 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
10404 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10405 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
10406 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
10407 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
10408 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1]
10409 ; AVX2-NEXT: vmovdqa (%rcx), %xmm0
10410 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10411 ; AVX2-NEXT: vmovdqa (%rdx), %xmm2
10412 ; AVX2-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
10413 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
10414 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
10415 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
10416 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3]
10417 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
10418 ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm13
10419 ; AVX2-NEXT: vmovdqa 32(%rsi), %xmm2
10420 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10421 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm1
10422 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10423 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
10424 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
10425 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
10426 ; AVX2-NEXT: vmovdqa 32(%rcx), %xmm2
10427 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10428 ; AVX2-NEXT: vmovdqa 32(%rdx), %xmm11
10429 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3]
10430 ; AVX2-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10431 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7]
10432 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
10433 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
10434 ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
10435 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10436 ; AVX2-NEXT: vmovdqa 64(%rsi), %xmm2
10437 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10438 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm1
10439 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10440 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
10441 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
10442 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
10443 ; AVX2-NEXT: vmovdqa 64(%rcx), %xmm2
10444 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10445 ; AVX2-NEXT: vmovdqa 64(%rdx), %xmm9
10446 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
10447 ; AVX2-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10448 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7]
10449 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
10450 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
10451 ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
10452 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10453 ; AVX2-NEXT: vmovdqa 96(%rsi), %xmm2
10454 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10455 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm1
10456 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10457 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
10458 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
10459 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
10460 ; AVX2-NEXT: vmovdqa 96(%rcx), %xmm5
10461 ; AVX2-NEXT: vmovdqa 96(%rdx), %xmm8
10462 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
10463 ; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10464 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10465 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7]
10466 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
10467 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
10468 ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm4
10469 ; AVX2-NEXT: vmovdqa (%r9), %xmm0
10470 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10471 ; AVX2-NEXT: vmovdqa (%r8), %xmm1
10472 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10473 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
10474 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10475 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,3,2,4,5,6,7]
10476 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
10477 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1]
10478 ; AVX2-NEXT: vpbroadcastd (%rax), %ymm2
10479 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0]
10480 ; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1
10481 ; AVX2-NEXT: vmovdqa 32(%r9), %xmm0
10482 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10483 ; AVX2-NEXT: vmovdqa 32(%r8), %xmm2
10484 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10485 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
10486 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10487 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,1,3,2,4,5,6,7]
10488 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
10489 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
10490 ; AVX2-NEXT: vpbroadcastd 32(%rax), %ymm3
10491 ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm7
10492 ; AVX2-NEXT: vmovdqa 64(%r9), %xmm0
10493 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10494 ; AVX2-NEXT: vmovdqa 64(%r8), %xmm2
10495 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10496 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
10497 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,3,2,4,5,6,7]
10498 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
10499 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
10500 ; AVX2-NEXT: vpbroadcastd 64(%rax), %ymm12
10501 ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm12, %ymm0
10502 ; AVX2-NEXT: vmovdqa 96(%r9), %xmm10
10503 ; AVX2-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10504 ; AVX2-NEXT: vmovdqa 96(%r8), %xmm2
10505 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10506 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3]
10507 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[0,1,3,2,4,5,6,7]
10508 ; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,1,3]
10509 ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1]
10510 ; AVX2-NEXT: vpbroadcastd 96(%rax), %ymm14
10511 ; AVX2-NEXT: vpblendvb %ymm6, %ymm15, %ymm14, %ymm6
10512 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
10513 ; AVX2-NEXT: vpblendvb %ymm14, %ymm13, %ymm1, %ymm1
10514 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10515 ; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm1 # 32-byte Folded Reload
10516 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10517 ; AVX2-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
10518 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10519 ; AVX2-NEXT: vpblendvb %ymm14, %ymm4, %ymm6, %ymm0
10520 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10521 ; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
10522 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10523 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
10524 ; AVX2-NEXT: vpshufd $165, (%rsp), %xmm1 # 16-byte Folded Reload
10525 ; AVX2-NEXT: # xmm1 = mem[1,1,2,2]
10526 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7]
10527 ; AVX2-NEXT: vpshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10528 ; AVX2-NEXT: # xmm1 = mem[3,3,3,3,4,5,6,7]
10529 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
10530 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
10531 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[1,1,2,3]
10532 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2],xmm6[3,4],xmm1[5],xmm6[6,7]
10533 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,0,1,1]
10534 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
10535 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
10536 ; AVX2-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1
10537 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10538 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10539 ; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
10540 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[1,1,2,2]
10541 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3],xmm1[4],xmm6[5,6],xmm1[7]
10542 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
10543 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,3,3,3,4,5,6,7]
10544 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
10545 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
10546 ; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[1,1,2,3]
10547 ; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0,1],xmm6[2],xmm14[3,4],xmm6[5],xmm14[6,7]
10548 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
10549 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
10550 ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1
10551 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10552 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10553 ; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
10554 ; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[1,1,2,2]
10555 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3],xmm1[4],xmm14[5,6],xmm1[7]
10556 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10557 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm11[3,3,3,3,4,5,6,7]
10558 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4]
10559 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10560 ; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[1,1,2,3]
10561 ; AVX2-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2],xmm15[3,4],xmm14[5],xmm15[6,7]
10562 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
10563 ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
10564 ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm14, %ymm1
10565 ; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm4
10566 ; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[1,1,2,2]
10567 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm4[1],xmm14[2,3],xmm4[4],xmm14[5,6],xmm4[7]
10568 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10569 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm9[3,3,3,3,4,5,6,7]
10570 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,4,4]
10571 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10572 ; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[1,1,2,3]
10573 ; AVX2-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1],xmm14[2],xmm15[3,4],xmm14[5],xmm15[6,7]
10574 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
10575 ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
10576 ; AVX2-NEXT: vpblendvb %ymm0, %ymm4, %ymm14, %ymm0
10577 ; AVX2-NEXT: vpshufhw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
10578 ; AVX2-NEXT: # xmm4 = mem[0,1,2,3,4,5,7,6]
10579 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,3,3]
10580 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3]
10581 ; AVX2-NEXT: vpbroadcastd 4(%rax), %ymm8
10582 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
10583 ; AVX2-NEXT: vpblendvb %ymm14, %ymm4, %ymm8, %ymm4
10584 ; AVX2-NEXT: vpshufhw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
10585 ; AVX2-NEXT: # xmm5 = mem[0,1,2,3,4,5,7,6]
10586 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,2,3,3]
10587 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3]
10588 ; AVX2-NEXT: vpbroadcastd 36(%rax), %ymm8
10589 ; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm8, %ymm5
10590 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6]
10591 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,3,3]
10592 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
10593 ; AVX2-NEXT: vpbroadcastd 68(%rax), %ymm8
10594 ; AVX2-NEXT: vpblendvb %ymm14, %ymm3, %ymm8, %ymm3
10595 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6]
10596 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,3,3]
10597 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
10598 ; AVX2-NEXT: vpbroadcastd 100(%rax), %ymm8
10599 ; AVX2-NEXT: vpblendvb %ymm14, %ymm2, %ymm8, %ymm2
10600 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
10601 ; AVX2-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
10602 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10603 ; AVX2-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload
10604 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10605 ; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1
10606 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10607 ; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0
10608 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10609 ; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
10610 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10611 ; AVX2-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10612 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10613 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
10614 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
10615 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,2,1]
10616 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
10617 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4]
10618 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
10619 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0]
10620 ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0
10621 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10622 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
10623 ; AVX2-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
10624 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
10625 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7]
10626 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
10627 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
10628 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4]
10629 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3]
10630 ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
10631 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10632 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
10633 ; AVX2-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
10634 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
10635 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7]
10636 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
10637 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
10638 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4]
10639 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3]
10640 ; AVX2-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
10641 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10642 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
10643 ; AVX2-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
10644 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
10645 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7]
10646 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
10647 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
10648 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4]
10649 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,3]
10650 ; AVX2-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3
10651 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10652 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
10653 ; AVX2-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
10654 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
10655 ; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm4
10656 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
10657 ; AVX2-NEXT: vpbroadcastd 8(%rax), %ymm5
10658 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
10659 ; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
10660 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10661 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
10662 ; AVX2-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
10663 ; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm5
10664 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
10665 ; AVX2-NEXT: vpbroadcastd 40(%rax), %ymm7
10666 ; AVX2-NEXT: vpblendvb %ymm6, %ymm5, %ymm7, %ymm5
10667 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10668 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
10669 ; AVX2-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7]
10670 ; AVX2-NEXT: vpshufb %xmm9, %xmm7, %xmm7
10671 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1]
10672 ; AVX2-NEXT: vpbroadcastd 72(%rax), %ymm8
10673 ; AVX2-NEXT: vpblendvb %ymm6, %ymm7, %ymm8, %ymm7
10674 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10675 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
10676 ; AVX2-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7]
10677 ; AVX2-NEXT: vpshufb %xmm9, %xmm8, %xmm8
10678 ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
10679 ; AVX2-NEXT: vpbroadcastd 104(%rax), %ymm9
10680 ; AVX2-NEXT: vpblendvb %ymm6, %ymm8, %ymm9, %ymm6
10681 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
10682 ; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm0, %ymm0
10683 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10684 ; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm1, %ymm0
10685 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10686 ; AVX2-NEXT: vpblendvb %ymm8, %ymm7, %ymm2, %ymm0
10687 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10688 ; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm0
10689 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10690 ; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,0,4,0,0,4]
10691 ; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
10692 ; AVX2-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10693 ; AVX2-NEXT: # ymm2 = mem[0,3,2,3,4,7,6,7]
10694 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
10695 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535]
10696 ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
10697 ; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,0,0,0,4,0,0]
10698 ; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
10699 ; AVX2-NEXT: vpshuflw $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10700 ; AVX2-NEXT: # ymm3 = mem[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
10701 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
10702 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0]
10703 ; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2
10704 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535]
10705 ; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
10706 ; AVX2-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,3,0,4]
10707 ; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
10708 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
10709 ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
10710 ; AVX2-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10711 ; AVX2-NEXT: # ymm2 = mem[0,1,0,3,4,5,4,7]
10712 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15]
10713 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535]
10714 ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
10715 ; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,3,0,0,0,4]
10716 ; AVX2-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
10717 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
10718 ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0
10719 ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
10720 ; AVX2-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10721 ; AVX2-NEXT: # ymm2 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
10722 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
10723 ; AVX2-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10724 ; AVX2-NEXT: # ymm3 = mem[1,1,1,1,5,5,5,5]
10725 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
10726 ; AVX2-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10727 ; AVX2-NEXT: # ymm3 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
10728 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
10729 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
10730 ; AVX2-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7]
10731 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15]
10732 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
10733 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2]
10734 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
10735 ; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm1
10736 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10737 ; AVX2-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10738 ; AVX2-NEXT: # ymm3 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
10739 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
10740 ; AVX2-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
10741 ; AVX2-NEXT: # ymm4 = mem[1,1,1,1,5,5,5,5]
10742 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
10743 ; AVX2-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
10744 ; AVX2-NEXT: # ymm4 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
10745 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4]
10746 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
10747 ; AVX2-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
10748 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
10749 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
10750 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2]
10751 ; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm1
10752 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10753 ; AVX2-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
10754 ; AVX2-NEXT: # ymm4 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
10755 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
10756 ; AVX2-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
10757 ; AVX2-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5]
10758 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
10759 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
10760 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
10761 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
10762 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
10763 ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,1,3,4,5,5,7]
10764 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
10765 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
10766 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2]
10767 ; AVX2-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm0
10768 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10769 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10770 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
10771 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
10772 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
10773 ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[0,0,2,1,4,4,6,5]
10774 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15]
10775 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3]
10776 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
10777 ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,5,7]
10778 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
10779 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0]
10780 ; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
10781 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
10782 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm6 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
10783 ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4]
10784 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10785 ; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,0,2,1,4,4,6,5]
10786 ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7,8,9,10],ymm8[11],ymm6[12,13],ymm8[14],ymm6[15]
10787 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3]
10788 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
10789 ; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[0,1,1,3,4,5,5,7]
10790 ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
10791 ; AVX2-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm6
10792 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
10793 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm8 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
10794 ; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,0,0,4,4,4,4]
10795 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
10796 ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,0,2,1,4,4,6,5]
10797 ; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7,8,9,10],ymm9[11],ymm8[12,13],ymm9[14],ymm8[15]
10798 ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
10799 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
10800 ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[0,1,1,3,4,5,5,7]
10801 ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
10802 ; AVX2-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7
10803 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
10804 ; AVX2-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
10805 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10806 ; AVX2-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
10807 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10808 ; AVX2-NEXT: vpblendvb %ymm8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload
10809 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10810 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
10811 ; AVX2-NEXT: # ymm9 = mem[0,1,0,1]
10812 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10813 ; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm5
10814 ; AVX2-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
10815 ; AVX2-NEXT: # ymm6 = mem[2,2,2,2,6,6,6,6]
10816 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
10817 ; AVX2-NEXT: vpshufhw $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
10818 ; AVX2-NEXT: # ymm6 = mem[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
10819 ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6]
10820 ; AVX2-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
10821 ; AVX2-NEXT: # ymm7 = mem[2,2,2,2,6,6,6,6]
10822 ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15]
10823 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
10824 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
10825 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
10826 ; AVX2-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5
10827 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10828 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
10829 ; AVX2-NEXT: vpshufb %ymm9, %ymm5, %ymm6
10830 ; AVX2-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
10831 ; AVX2-NEXT: # ymm7 = mem[2,2,2,2,6,6,6,6]
10832 ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
10833 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
10834 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
10835 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,2,6,6,6,6]
10836 ; AVX2-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
10837 ; AVX2-NEXT: # ymm8 = mem[2,2,2,2,6,6,6,6]
10838 ; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13,14,15]
10839 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
10840 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
10841 ; AVX2-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6
10842 ; AVX2-NEXT: vpshufb %ymm9, %ymm2, %ymm7
10843 ; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[2,2,2,2,6,6,6,6]
10844 ; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15]
10845 ; AVX2-NEXT: vpshufhw $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
10846 ; AVX2-NEXT: # ymm8 = mem[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
10847 ; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,2,2,2,6,6,6,6]
10848 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10849 ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,2,2,2,6,6,6,6]
10850 ; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15]
10851 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
10852 ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
10853 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
10854 ; AVX2-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm7
10855 ; AVX2-NEXT: vpshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
10856 ; AVX2-NEXT: # ymm8 = mem[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
10857 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
10858 ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm15[1,2,2,3,5,6,6,7]
10859 ; AVX2-NEXT: vmovdqa %ymm15, %ymm2
10860 ; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15]
10861 ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2]
10862 ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm10[0,1,2,2,4,5,6,6]
10863 ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3]
10864 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535]
10865 ; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
10866 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm11[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
10867 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
10868 ; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[1,2,2,3,5,6,6,7]
10869 ; AVX2-NEXT: vmovdqa %ymm4, %ymm1
10870 ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7,8,9],ymm9[10],ymm11[11,12],ymm9[13],ymm11[14,15]
10871 ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2]
10872 ; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[0,1,2,2,4,5,6,6]
10873 ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3]
10874 ; AVX2-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9
10875 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
10876 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
10877 ; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[1,2,2,3,5,6,6,7]
10878 ; AVX2-NEXT: vmovdqa %ymm13, %ymm15
10879 ; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15]
10880 ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
10881 ; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[0,1,2,2,4,5,6,6]
10882 ; AVX2-NEXT: vmovdqa %ymm14, %ymm3
10883 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3]
10884 ; AVX2-NEXT: vpblendvb %ymm10, %ymm11, %ymm12, %ymm10
10885 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
10886 ; AVX2-NEXT: vpblendvb %ymm11, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload
10887 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10888 ; AVX2-NEXT: vpblendvb %ymm11, %ymm6, %ymm9, %ymm6
10889 ; AVX2-NEXT: vpblendvb %ymm11, %ymm7, %ymm10, %ymm7
10890 ; AVX2-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
10891 ; AVX2-NEXT: # ymm8 = mem[3,3,3,3,7,7,7,7]
10892 ; AVX2-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
10893 ; AVX2-NEXT: # ymm9 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
10894 ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
10895 ; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7,8,9,10],ymm9[11],ymm8[12,13],ymm9[14],ymm8[15]
10896 ; AVX2-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
10897 ; AVX2-NEXT: # ymm9 = mem[3,3,3,3,7,7,7,7]
10898 ; AVX2-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
10899 ; AVX2-NEXT: # ymm10 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
10900 ; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6]
10901 ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6,7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13,14,15]
10902 ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3]
10903 ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
10904 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
10905 ; AVX2-NEXT: vpblendvb %ymm4, %ymm8, %ymm9, %ymm8
10906 ; AVX2-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
10907 ; AVX2-NEXT: # ymm9 = mem[3,3,3,3,7,7,7,7]
10908 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm10 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
10909 ; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,2,2,6,6,6,6]
10910 ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8,9,10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15]
10911 ; AVX2-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
10912 ; AVX2-NEXT: # ymm10 = mem[3,3,3,3,7,7,7,7]
10913 ; AVX2-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
10914 ; AVX2-NEXT: # ymm11 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
10915 ; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,2,6,6,6,6]
10916 ; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13,14,15]
10917 ; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3]
10918 ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3]
10919 ; AVX2-NEXT: vpblendvb %ymm4, %ymm9, %ymm10, %ymm9
10920 ; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm0[3,3,3,3,7,7,7,7]
10921 ; AVX2-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
10922 ; AVX2-NEXT: # ymm11 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
10923 ; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,2,6,6,6,6]
10924 ; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7,8,9,10],ymm11[11],ymm10[12,13],ymm11[14],ymm10[15]
10925 ; AVX2-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
10926 ; AVX2-NEXT: # ymm11 = mem[3,3,3,3,7,7,7,7]
10927 ; AVX2-NEXT: vpshufhw $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
10928 ; AVX2-NEXT: # ymm12 = mem[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
10929 ; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6]
10930 ; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13,14,15]
10931 ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3]
10932 ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
10933 ; AVX2-NEXT: vpblendvb %ymm4, %ymm10, %ymm11, %ymm10
10934 ; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[3,3,3,3,7,7,7,7]
10935 ; AVX2-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
10936 ; AVX2-NEXT: # ymm12 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
10937 ; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,3,6,6,6,7]
10938 ; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15]
10939 ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
10940 ; AVX2-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
10941 ; AVX2-NEXT: # ymm12 = mem[2,3,3,3,6,7,7,7]
10942 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2]
10943 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
10944 ; AVX2-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
10945 ; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,3,3,3,7,7,7,7]
10946 ; AVX2-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
10947 ; AVX2-NEXT: # ymm14 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
10948 ; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,3,6,6,6,7]
10949 ; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15]
10950 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
10951 ; AVX2-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
10952 ; AVX2-NEXT: # ymm14 = mem[2,3,3,3,6,7,7,7]
10953 ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,2]
10954 ; AVX2-NEXT: vpblendvb %ymm13, %ymm12, %ymm14, %ymm12
10955 ; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[3,3,3,3,7,7,7,7]
10956 ; AVX2-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
10957 ; AVX2-NEXT: # ymm15 = mem[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
10958 ; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,2,2,3,6,6,6,7]
10959 ; AVX2-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15]
10960 ; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
10961 ; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[2,3,3,3,6,7,7,7]
10962 ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2]
10963 ; AVX2-NEXT: vpblendvb %ymm13, %ymm14, %ymm15, %ymm13
10964 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
10965 ; AVX2-NEXT: vpblendvb %ymm14, %ymm8, %ymm11, %ymm8
10966 ; AVX2-NEXT: vpblendvb %ymm14, %ymm9, %ymm12, %ymm9
10967 ; AVX2-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10
10968 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
10969 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10970 ; AVX2-NEXT: vmovaps %ymm0, 544(%rax)
10971 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10972 ; AVX2-NEXT: vmovaps %ymm0, 320(%rax)
10973 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10974 ; AVX2-NEXT: vmovaps %ymm0, 96(%rax)
10975 ; AVX2-NEXT: vmovdqa %ymm10, 640(%rax)
10976 ; AVX2-NEXT: vmovdqa %ymm7, 608(%rax)
10977 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10978 ; AVX2-NEXT: vmovaps %ymm0, 576(%rax)
10979 ; AVX2-NEXT: vmovdqa %ymm9, 416(%rax)
10980 ; AVX2-NEXT: vmovdqa %ymm6, 384(%rax)
10981 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10982 ; AVX2-NEXT: vmovaps %ymm0, 352(%rax)
10983 ; AVX2-NEXT: vmovdqa %ymm8, 192(%rax)
10984 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10985 ; AVX2-NEXT: vmovaps %ymm0, 160(%rax)
10986 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10987 ; AVX2-NEXT: vmovaps %ymm0, 128(%rax)
10988 ; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
10989 ; AVX2-NEXT: vmovaps %ymm0, 768(%rax)
10990 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10991 ; AVX2-NEXT: vmovaps %ymm0, 736(%rax)
10992 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10993 ; AVX2-NEXT: vmovaps %ymm0, 704(%rax)
10994 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10995 ; AVX2-NEXT: vmovaps %ymm0, 672(%rax)
10996 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10997 ; AVX2-NEXT: vmovaps %ymm0, 512(%rax)
10998 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
10999 ; AVX2-NEXT: vmovaps %ymm0, 480(%rax)
11000 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11001 ; AVX2-NEXT: vmovaps %ymm0, 448(%rax)
11002 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11003 ; AVX2-NEXT: vmovaps %ymm0, 288(%rax)
11004 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11005 ; AVX2-NEXT: vmovaps %ymm0, 256(%rax)
11006 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11007 ; AVX2-NEXT: vmovaps %ymm0, 224(%rax)
11008 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11009 ; AVX2-NEXT: vmovaps %ymm0, 64(%rax)
11010 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11011 ; AVX2-NEXT: vmovaps %ymm0, 32(%rax)
11012 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11013 ; AVX2-NEXT: vmovaps %ymm0, (%rax)
11014 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11015 ; AVX2-NEXT: vmovaps %ymm0, 864(%rax)
11016 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11017 ; AVX2-NEXT: vmovaps %ymm0, 832(%rax)
11018 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11019 ; AVX2-NEXT: vmovaps %ymm0, 800(%rax)
11020 ; AVX2-NEXT: addq $1688, %rsp # imm = 0x698
11021 ; AVX2-NEXT: vzeroupper
11022 ; AVX2-NEXT: retq
11023 ;
11024 ; AVX2-FP-LABEL: store_i16_stride7_vf64:
11025 ; AVX2-FP: # %bb.0:
11026 ; AVX2-FP-NEXT: subq $1544, %rsp # imm = 0x608
11027 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm0
11028 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11029 ; AVX2-FP-NEXT: vmovdqa 96(%rsi), %ymm2
11030 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11031 ; AVX2-FP-NEXT: vmovdqa 96(%rdx), %ymm3
11032 ; AVX2-FP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill
11033 ; AVX2-FP-NEXT: vmovdqa 96(%rcx), %ymm5
11034 ; AVX2-FP-NEXT: vmovdqa 96(%r8), %ymm7
11035 ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11036 ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [3,0,0,0,4,0,0,4]
11037 ; AVX2-FP-NEXT: vpermd %ymm0, %ymm14, %ymm1
11038 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31]
11039 ; AVX2-FP-NEXT: vpshufb %ymm15, %ymm2, %ymm4
11040 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535]
11041 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1
11042 ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,3,0,0,0,4,0,0]
11043 ; AVX2-FP-NEXT: vpermd %ymm3, %ymm10, %ymm4
11044 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31]
11045 ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm5, %ymm6
11046 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0]
11047 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm4
11048 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535]
11049 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1
11050 ; AVX2-FP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,3,0,4]
11051 ; AVX2-FP-NEXT: vpermd %ymm7, %ymm0, %ymm4
11052 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
11053 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1
11054 ; AVX2-FP-NEXT: vmovdqa 96(%r9), %ymm4
11055 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31]
11056 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm4, %ymm6
11057 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535]
11058 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm6, %ymm1
11059 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
11060 ; AVX2-FP-NEXT: vmovdqa 96(%rax), %ymm6
11061 ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [3,0,0,3,0,0,0,4]
11062 ; AVX2-FP-NEXT: vpermd %ymm6, %ymm0, %ymm7
11063 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
11064 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1
11065 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11066 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
11067 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11068 ; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm3
11069 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11070 ; AVX2-FP-NEXT: vpermd %ymm0, %ymm14, %ymm1
11071 ; AVX2-FP-NEXT: vpshufb %ymm15, %ymm3, %ymm7
11072 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm7, %ymm3
11073 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1
11074 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11075 ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm0
11076 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11077 ; AVX2-FP-NEXT: vpermd %ymm1, %ymm14, %ymm7
11078 ; AVX2-FP-NEXT: vpshufb %ymm15, %ymm0, %ymm8
11079 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm7, %ymm8, %ymm7
11080 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0
11081 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11082 ; AVX2-FP-NEXT: vpermd %ymm0, %ymm14, %ymm8
11083 ; AVX2-FP-NEXT: vmovdqa 64(%rsi), %ymm0
11084 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11085 ; AVX2-FP-NEXT: vpshufb %ymm15, %ymm0, %ymm14
11086 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm8, %ymm14, %ymm1
11087 ; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm0
11088 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11089 ; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm2
11090 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11091 ; AVX2-FP-NEXT: vpermd %ymm0, %ymm10, %ymm8
11092 ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm2, %ymm14
11093 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm8, %ymm14, %ymm8
11094 ; AVX2-FP-NEXT: vmovdqa 32(%rdx), %ymm2
11095 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11096 ; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm0
11097 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11098 ; AVX2-FP-NEXT: vpermd %ymm2, %ymm10, %ymm14
11099 ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm0, %ymm15
11100 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm14, %ymm15, %ymm14
11101 ; AVX2-FP-NEXT: vmovdqa 64(%rdx), %ymm0
11102 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11103 ; AVX2-FP-NEXT: vpermd %ymm0, %ymm10, %ymm10
11104 ; AVX2-FP-NEXT: vmovdqa 64(%rcx), %ymm0
11105 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11106 ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm0, %ymm11
11107 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm10, %ymm11, %ymm10
11108 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm3, %ymm8, %ymm3
11109 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm7, %ymm14, %ymm7
11110 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm10, %ymm0
11111 ; AVX2-FP-NEXT: vmovdqa (%r8), %ymm1
11112 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11113 ; AVX2-FP-NEXT: vmovdqa (%r9), %ymm2
11114 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11115 ; AVX2-FP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,3,0,4]
11116 ; AVX2-FP-NEXT: vpermd %ymm1, %ymm12, %ymm8
11117 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm2, %ymm10
11118 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0]
11119 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8
11120 ; AVX2-FP-NEXT: vmovdqa 32(%r8), %ymm2
11121 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11122 ; AVX2-FP-NEXT: vmovdqa 32(%r9), %ymm1
11123 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11124 ; AVX2-FP-NEXT: vpermd %ymm2, %ymm12, %ymm10
11125 ; AVX2-FP-NEXT: vmovdqa %ymm12, %ymm2
11126 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm1, %ymm12
11127 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm10, %ymm12, %ymm10
11128 ; AVX2-FP-NEXT: vmovdqa 64(%r8), %ymm1
11129 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11130 ; AVX2-FP-NEXT: vpermd %ymm1, %ymm2, %ymm12
11131 ; AVX2-FP-NEXT: vmovdqa 64(%r9), %ymm1
11132 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11133 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm1, %ymm9
11134 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm12, %ymm9, %ymm9
11135 ; AVX2-FP-NEXT: vmovdqa (%rax), %ymm1
11136 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11137 ; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,0,0,3,0,0,0,4]
11138 ; AVX2-FP-NEXT: vpermd %ymm1, %ymm2, %ymm11
11139 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0]
11140 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8
11141 ; AVX2-FP-NEXT: vmovdqa 32(%rax), %ymm1
11142 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11143 ; AVX2-FP-NEXT: vpermd %ymm1, %ymm2, %ymm11
11144 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
11145 ; AVX2-FP-NEXT: vmovdqa 64(%rax), %ymm1
11146 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11147 ; AVX2-FP-NEXT: vpermd %ymm1, %ymm2, %ymm11
11148 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9
11149 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
11150 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm3, %ymm8, %ymm1
11151 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11152 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm7, %ymm10, %ymm1
11153 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11154 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm9, %ymm0
11155 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11156 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
11157 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
11158 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11159 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,1,5,5,5,5]
11160 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
11161 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
11162 ; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload
11163 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm8[0,1,1,3,4,5,5,7]
11164 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7,8,9],ymm1[10],ymm7[11,12],ymm1[13],ymm7[14,15]
11165 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
11166 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
11167 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
11168 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
11169 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11170 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,0,2,1,4,4,6,5]
11171 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
11172 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
11173 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
11174 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15]
11175 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
11176 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
11177 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
11178 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,1,1,3,4,5,5,7]
11179 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
11180 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
11181 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
11182 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11183 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
11184 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,2,2,2,6,6,6,6]
11185 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
11186 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
11187 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[2,2,2,2,6,6,6,6]
11188 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7,8],ymm1[9],ymm7[10,11],ymm1[12],ymm7[13,14,15]
11189 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
11190 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
11191 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
11192 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
11193 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[1,2,2,3,5,6,6,7]
11194 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
11195 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
11196 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
11197 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
11198 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
11199 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
11200 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
11201 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,1,2,2,4,5,6,6]
11202 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
11203 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
11204 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
11205 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11206 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
11207 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7]
11208 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
11209 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
11210 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7]
11211 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
11212 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
11213 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
11214 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
11215 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
11216 ; AVX2-FP-NEXT: vpbroadcastd 124(%r8), %ymm1
11217 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
11218 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
11219 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
11220 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
11221 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
11222 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
11223 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,3,3,3,6,7,7,7]
11224 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
11225 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
11226 ; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
11227 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11228 ; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1
11229 ; AVX2-FP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
11230 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
11231 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11232 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
11233 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
11234 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
11235 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,1,1]
11236 ; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm12
11237 ; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1
11238 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11239 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
11240 ; AVX2-FP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11241 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
11242 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
11243 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,3]
11244 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
11245 ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2
11246 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11247 ; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm3
11248 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11249 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2
11250 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11251 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
11252 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
11253 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
11254 ; AVX2-FP-NEXT: vmovdqa 32(%rcx), %xmm14
11255 ; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm11
11256 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
11257 ; AVX2-FP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11258 ; AVX2-FP-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11259 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm3
11260 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
11261 ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2
11262 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11263 ; AVX2-FP-NEXT: vmovdqa 64(%rsi), %xmm2
11264 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11265 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm3
11266 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11267 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
11268 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
11269 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
11270 ; AVX2-FP-NEXT: vmovdqa 64(%rcx), %xmm3
11271 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11272 ; AVX2-FP-NEXT: vmovdqa 64(%rdx), %xmm10
11273 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3]
11274 ; AVX2-FP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11275 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm3
11276 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
11277 ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2
11278 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11279 ; AVX2-FP-NEXT: vmovdqa 96(%rcx), %xmm2
11280 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11281 ; AVX2-FP-NEXT: vmovdqa 96(%rdx), %xmm3
11282 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11283 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
11284 ; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm0
11285 ; AVX2-FP-NEXT: vmovdqa 96(%rsi), %xmm3
11286 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11287 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm2
11288 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11289 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
11290 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
11291 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
11292 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
11293 ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm7
11294 ; AVX2-FP-NEXT: vmovdqa (%r9), %xmm0
11295 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11296 ; AVX2-FP-NEXT: vmovdqa (%r8), %xmm1
11297 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11298 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
11299 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
11300 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm9, %xmm1
11301 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1]
11302 ; AVX2-FP-NEXT: vpbroadcastd (%rax), %ymm2
11303 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0]
11304 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm5
11305 ; AVX2-FP-NEXT: vmovdqa 32(%r9), %xmm0
11306 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11307 ; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm1
11308 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11309 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
11310 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm8, %xmm2
11311 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
11312 ; AVX2-FP-NEXT: vpbroadcastd 32(%rax), %ymm3
11313 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm1
11314 ; AVX2-FP-NEXT: vmovdqa 64(%r9), %xmm0
11315 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11316 ; AVX2-FP-NEXT: vmovdqa 64(%r8), %xmm2
11317 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11318 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
11319 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm2
11320 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
11321 ; AVX2-FP-NEXT: vpbroadcastd 64(%rax), %ymm13
11322 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm2, %ymm13, %ymm0
11323 ; AVX2-FP-NEXT: vmovdqa 96(%r8), %xmm13
11324 ; AVX2-FP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11325 ; AVX2-FP-NEXT: vmovdqa 96(%r9), %xmm2
11326 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11327 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3]
11328 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm4
11329 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1]
11330 ; AVX2-FP-NEXT: vpbroadcastd 96(%rax), %ymm15
11331 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm15, %ymm4
11332 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
11333 ; AVX2-FP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
11334 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11335 ; AVX2-FP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
11336 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11337 ; AVX2-FP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11338 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11339 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm7, %ymm4, %ymm0
11340 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11341 ; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
11342 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm12, %xmm0
11343 ; AVX2-FP-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
11344 ; AVX2-FP-NEXT: # xmm4 = mem[1,1,2,2]
11345 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7]
11346 ; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
11347 ; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
11348 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
11349 ; AVX2-FP-NEXT: vpshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
11350 ; AVX2-FP-NEXT: # xmm6 = mem[1,1,2,3]
11351 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7]
11352 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
11353 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
11354 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
11355 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm6
11356 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm14, %xmm0
11357 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[1,1,2,2]
11358 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7]
11359 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
11360 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
11361 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
11362 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,2,3]
11363 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7]
11364 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
11365 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
11366 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm4
11367 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11368 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
11369 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[1,1,2,2]
11370 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0],xmm0[1],xmm15[2,3],xmm0[4],xmm15[5,6],xmm0[7]
11371 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11372 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm10, %xmm15
11373 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
11374 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,1,2,3]
11375 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3,4],xmm15[5],xmm13[6,7]
11376 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
11377 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,2,1]
11378 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm13, %ymm0
11379 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
11380 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm13, %xmm1
11381 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
11382 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[1,1,2,2]
11383 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0],xmm1[1],xmm13[2,3],xmm1[4],xmm13[5,6],xmm1[7]
11384 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
11385 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm11, %xmm5
11386 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11387 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[1,1,2,3]
11388 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0,1],xmm5[2],xmm13[3,4],xmm5[5],xmm13[6,7]
11389 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
11390 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
11391 ; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm1
11392 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
11393 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm9, %xmm7
11394 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3]
11395 ; AVX2-FP-NEXT: vpbroadcastd 4(%rax), %ymm9
11396 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
11397 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm7, %ymm9, %ymm7
11398 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm8, %xmm8
11399 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3]
11400 ; AVX2-FP-NEXT: vpbroadcastd 36(%rax), %ymm9
11401 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm8, %ymm9, %ymm8
11402 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm3
11403 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
11404 ; AVX2-FP-NEXT: vpbroadcastd 68(%rax), %ymm9
11405 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm3, %ymm9, %ymm3
11406 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
11407 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
11408 ; AVX2-FP-NEXT: vpbroadcastd 100(%rax), %ymm5
11409 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm2, %ymm5, %ymm2
11410 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
11411 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm6, %ymm7, %ymm6
11412 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11413 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm8, %ymm4
11414 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11415 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0
11416 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11417 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm0
11418 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11419 ; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
11420 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11421 ; AVX2-FP-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11422 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11423 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11424 ; AVX2-FP-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
11425 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
11426 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
11427 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
11428 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7]
11429 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
11430 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0]
11431 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0
11432 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11433 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
11434 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
11435 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
11436 ; AVX2-FP-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
11437 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
11438 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
11439 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7]
11440 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
11441 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
11442 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
11443 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
11444 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11445 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
11446 ; AVX2-FP-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
11447 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
11448 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3]
11449 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7]
11450 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
11451 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2
11452 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
11453 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm5, %xmm3
11454 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm5 # 16-byte Folded Reload
11455 ; AVX2-FP-NEXT: # xmm5 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7]
11456 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3]
11457 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7]
11458 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
11459 ; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3
11460 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
11461 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
11462 ; AVX2-FP-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
11463 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
11464 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm4, %xmm4
11465 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
11466 ; AVX2-FP-NEXT: vpbroadcastd 8(%rax), %ymm5
11467 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
11468 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
11469 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11470 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
11471 ; AVX2-FP-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
11472 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm5, %xmm5
11473 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
11474 ; AVX2-FP-NEXT: vpbroadcastd 40(%rax), %ymm7
11475 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm5, %ymm7, %ymm5
11476 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
11477 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
11478 ; AVX2-FP-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7]
11479 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm7, %xmm7
11480 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1]
11481 ; AVX2-FP-NEXT: vpbroadcastd 72(%rax), %ymm8
11482 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm7, %ymm8, %ymm7
11483 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
11484 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
11485 ; AVX2-FP-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7]
11486 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm8, %xmm8
11487 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
11488 ; AVX2-FP-NEXT: vpbroadcastd 104(%rax), %ymm9
11489 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm8, %ymm9, %ymm6
11490 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
11491 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm4, %ymm0, %ymm0
11492 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11493 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm5, %ymm1, %ymm0
11494 ; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
11495 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm7, %ymm2, %ymm0
11496 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11497 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm0
11498 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11499 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11500 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
11501 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,0,1]
11502 ; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm4
11503 ; AVX2-FP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
11504 ; AVX2-FP-NEXT: # ymm5 = mem[1,1,1,1,5,5,5,5]
11505 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
11506 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11507 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
11508 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
11509 ; AVX2-FP-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7]
11510 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
11511 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
11512 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2]
11513 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
11514 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm0
11515 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11516 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11517 ; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm5
11518 ; AVX2-FP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
11519 ; AVX2-FP-NEXT: # ymm6 = mem[1,1,1,1,5,5,5,5]
11520 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
11521 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11522 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
11523 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
11524 ; AVX2-FP-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7]
11525 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
11526 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
11527 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2]
11528 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm5, %ymm6, %ymm0
11529 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11530 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
11531 ; AVX2-FP-NEXT: vpshufb %ymm1, %ymm12, %ymm6
11532 ; AVX2-FP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
11533 ; AVX2-FP-NEXT: # ymm7 = mem[1,1,1,1,5,5,5,5]
11534 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
11535 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11536 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
11537 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11538 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,1,1,3,4,5,5,7]
11539 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15]
11540 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
11541 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
11542 ; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm6, %ymm7, %ymm0
11543 ; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
11544 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11545 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm6, %ymm8
11546 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
11547 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm14[0,0,2,1,4,4,6,5]
11548 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7,8,9,10],ymm9[11],ymm8[12,13],ymm9[14],ymm8[15]
11549 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
11550 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
11551 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,1,1,3,4,5,5,7]
11552 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
11553 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0]
11554 ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
11555 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11556 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm9
11557 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11558 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm5[0,0,2,1,4,4,6,5]
11559 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8,9,10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15]
11560 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
11561 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
11562 ; AVX2-FP-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7]
11563 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
11564 ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9
11565 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
11566 ; AVX2-FP-NEXT: vpshufb %ymm7, %ymm15, %ymm7
11567 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11568 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[0,0,2,1,4,4,6,5]
11569 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7,8,9,10],ymm11[11],ymm7[12,13],ymm11[14],ymm7[15]
11570 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
11571 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
11572 ; AVX2-FP-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7]
11573 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
11574 ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm7, %ymm11, %ymm7
11575 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
11576 ; AVX2-FP-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload
11577 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11578 ; AVX2-FP-NEXT: vpblendvb %ymm10, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload
11579 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11580 ; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm7, %ymm0
11581 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11582 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11583 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
11584 ; AVX2-FP-NEXT: # ymm11 = mem[0,1,0,1]
11585 ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm7, %ymm7
11586 ; AVX2-FP-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
11587 ; AVX2-FP-NEXT: # ymm8 = mem[2,2,2,2,6,6,6,6]
11588 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15]
11589 ; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm0 = [151522058,0,421010202,421010202]
11590 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11591 ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm1, %ymm8
11592 ; AVX2-FP-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
11593 ; AVX2-FP-NEXT: # ymm9 = mem[2,2,2,2,6,6,6,6]
11594 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15]
11595 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
11596 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
11597 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
11598 ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm7
11599 ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11600 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11601 ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm7, %ymm8
11602 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11603 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm7[2,2,2,2,6,6,6,6]
11604 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15]
11605 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
11606 ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm9, %ymm9
11607 ; AVX2-FP-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
11608 ; AVX2-FP-NEXT: # ymm10 = mem[2,2,2,2,6,6,6,6]
11609 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15]
11610 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
11611 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
11612 ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm8, %ymm9, %ymm8
11613 ; AVX2-FP-NEXT: vpshufb %ymm11, %ymm2, %ymm9
11614 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[2,2,2,2,6,6,6,6]
11615 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
11616 ; AVX2-FP-NEXT: vpshufb %ymm0, %ymm12, %ymm10
11617 ; AVX2-FP-NEXT: vmovdqa %ymm12, %ymm0
11618 ; AVX2-FP-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
11619 ; AVX2-FP-NEXT: # ymm11 = mem[2,2,2,2,6,6,6,6]
11620 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13,14,15]
11621 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
11622 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3]
11623 ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm9
11624 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u]
11625 ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm6, %ymm11
11626 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[1,2,2,3,5,6,6,7]
11627 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15]
11628 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
11629 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[0,1,2,2,4,5,6,6]
11630 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3]
11631 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535]
11632 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
11633 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11634 ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm4, %ymm12
11635 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[1,2,2,3,5,6,6,7]
11636 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15]
11637 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2]
11638 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11639 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm5[0,1,2,2,4,5,6,6]
11640 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3]
11641 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm12, %ymm14, %ymm12
11642 ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm15, %ymm10
11643 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,2,2,3,5,6,6,7]
11644 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7,8,9],ymm10[10],ymm14[11,12],ymm10[13],ymm14[14,15]
11645 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2]
11646 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11647 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm14 = ymm2[0,1,2,2,4,5,6,6]
11648 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,3,3]
11649 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm10, %ymm14, %ymm10
11650 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
11651 ; AVX2-FP-NEXT: vpblendvb %ymm13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload
11652 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11653 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm8, %ymm12, %ymm8
11654 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm9, %ymm10, %ymm9
11655 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
11656 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11657 ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm1, %ymm10
11658 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
11659 ; AVX2-FP-NEXT: # ymm11 = mem[3,3,3,3,7,7,7,7]
11660 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8,9,10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15]
11661 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11662 ; AVX2-FP-NEXT: vpmovsxdq {{.*#+}} ymm13 = [218894094,0,488382238,488382238]
11663 ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm1, %ymm11
11664 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
11665 ; AVX2-FP-NEXT: # ymm12 = mem[3,3,3,3,7,7,7,7]
11666 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15]
11667 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3]
11668 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
11669 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
11670 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm10, %ymm11, %ymm10
11671 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11672 ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm1, %ymm11
11673 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
11674 ; AVX2-FP-NEXT: # ymm12 = mem[3,3,3,3,7,7,7,7]
11675 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8,9,10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15]
11676 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11677 ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm1, %ymm12
11678 ; AVX2-FP-NEXT: vmovdqa %ymm13, %ymm1
11679 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm7[3,3,3,3,7,7,7,7]
11680 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6,7,8],ymm12[9],ymm13[10,11],ymm12[12],ymm13[13,14,15]
11681 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3]
11682 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3]
11683 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm11, %ymm12, %ymm11
11684 ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm0, %ymm12
11685 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
11686 ; AVX2-FP-NEXT: # ymm13 = mem[3,3,3,3,7,7,7,7]
11687 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8,9,10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15]
11688 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11689 ; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm13
11690 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
11691 ; AVX2-FP-NEXT: # ymm14 = mem[3,3,3,3,7,7,7,7]
11692 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13,14,15]
11693 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3]
11694 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,2,2,3]
11695 ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm12, %ymm13, %ymm12
11696 ; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
11697 ; AVX2-FP-NEXT: # ymm13 = mem[0,1,0,1]
11698 ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm6, %ymm14
11699 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
11700 ; AVX2-FP-NEXT: # ymm15 = mem[3,3,3,3,7,7,7,7]
11701 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15]
11702 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
11703 ; AVX2-FP-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
11704 ; AVX2-FP-NEXT: # ymm15 = mem[2,3,3,3,6,7,7,7]
11705 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2]
11706 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
11707 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14
11708 ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm4, %ymm15
11709 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
11710 ; AVX2-FP-NEXT: # ymm1 = mem[3,3,3,3,7,7,7,7]
11711 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1],ymm1[2],ymm15[3,4],ymm1[5],ymm15[6,7,8,9],ymm1[10],ymm15[11,12],ymm1[13],ymm15[14,15]
11712 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
11713 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm5[2,3,3,3,6,7,7,7]
11714 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2]
11715 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm15, %ymm1
11716 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11717 ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm4, %ymm13
11718 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[3,3,3,3,7,7,7,7]
11719 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7,8,9],ymm15[10],ymm13[11,12],ymm15[13],ymm13[14,15]
11720 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
11721 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm2[2,3,3,3,6,7,7,7]
11722 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2]
11723 ; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm13, %ymm15, %ymm0
11724 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
11725 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm10, %ymm14, %ymm10
11726 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm11, %ymm1, %ymm1
11727 ; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm12, %ymm0, %ymm0
11728 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
11729 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11730 ; AVX2-FP-NEXT: vmovaps %ymm2, 544(%rax)
11731 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11732 ; AVX2-FP-NEXT: vmovaps %ymm2, 320(%rax)
11733 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11734 ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rax)
11735 ; AVX2-FP-NEXT: vmovdqa %ymm0, 640(%rax)
11736 ; AVX2-FP-NEXT: vmovdqa %ymm9, 608(%rax)
11737 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11738 ; AVX2-FP-NEXT: vmovaps %ymm0, 576(%rax)
11739 ; AVX2-FP-NEXT: vmovdqa %ymm1, 416(%rax)
11740 ; AVX2-FP-NEXT: vmovdqa %ymm8, 384(%rax)
11741 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11742 ; AVX2-FP-NEXT: vmovaps %ymm0, 352(%rax)
11743 ; AVX2-FP-NEXT: vmovdqa %ymm10, 192(%rax)
11744 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11745 ; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rax)
11746 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11747 ; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rax)
11748 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11749 ; AVX2-FP-NEXT: vmovaps %ymm0, 768(%rax)
11750 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11751 ; AVX2-FP-NEXT: vmovaps %ymm0, 736(%rax)
11752 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11753 ; AVX2-FP-NEXT: vmovaps %ymm0, 704(%rax)
11754 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11755 ; AVX2-FP-NEXT: vmovaps %ymm0, 672(%rax)
11756 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11757 ; AVX2-FP-NEXT: vmovaps %ymm0, 512(%rax)
11758 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11759 ; AVX2-FP-NEXT: vmovaps %ymm0, 480(%rax)
11760 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11761 ; AVX2-FP-NEXT: vmovaps %ymm0, 448(%rax)
11762 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
11763 ; AVX2-FP-NEXT: vmovaps %ymm0, 288(%rax)
11764 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11765 ; AVX2-FP-NEXT: vmovaps %ymm0, 256(%rax)
11766 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11767 ; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rax)
11768 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11769 ; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rax)
11770 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11771 ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax)
11772 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11773 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rax)
11774 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11775 ; AVX2-FP-NEXT: vmovaps %ymm0, 864(%rax)
11776 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11777 ; AVX2-FP-NEXT: vmovaps %ymm0, 832(%rax)
11778 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11779 ; AVX2-FP-NEXT: vmovaps %ymm0, 800(%rax)
11780 ; AVX2-FP-NEXT: addq $1544, %rsp # imm = 0x608
11781 ; AVX2-FP-NEXT: vzeroupper
11782 ; AVX2-FP-NEXT: retq
11783 ;
11784 ; AVX2-FCP-LABEL: store_i16_stride7_vf64:
11785 ; AVX2-FCP: # %bb.0:
11786 ; AVX2-FCP-NEXT: subq $1256, %rsp # imm = 0x4E8
11787 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm0
11788 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11789 ; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %ymm1
11790 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11791 ; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %ymm2
11792 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11793 ; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %ymm4
11794 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11795 ; AVX2-FCP-NEXT: vmovdqa 96(%r8), %ymm3
11796 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11797 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [3,0,0,0,4,0,0,4]
11798 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0
11799 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31]
11800 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535]
11801 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0
11802 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,0,0,0,4,0,0]
11803 ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1
11804 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31]
11805 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0]
11806 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
11807 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535]
11808 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
11809 ; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,3,0,4]
11810 ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm1
11811 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
11812 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
11813 ; AVX2-FCP-NEXT: vmovdqa 96(%r9), %ymm1
11814 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11815 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31]
11816 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535]
11817 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
11818 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
11819 ; AVX2-FCP-NEXT: vmovdqa 96(%rax), %ymm1
11820 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11821 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,0,0,3,0,0,0,4]
11822 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1
11823 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
11824 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
11825 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11826 ; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm11
11827 ; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm10
11828 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
11829 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm0
11830 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[0,0,2,1,4,4,6,5]
11831 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
11832 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
11833 ; AVX2-FCP-NEXT: vmovdqa (%rax), %ymm12
11834 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,4,5,4,5,5,7]
11835 ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm5, %ymm3
11836 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0]
11837 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm3
11838 ; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm6
11839 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11840 ; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm7
11841 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm0
11842 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11843 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,0,2,1,4,4,6,5]
11844 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7,8,9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
11845 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
11846 ; AVX2-FCP-NEXT: vmovdqa 32(%rax), %ymm4
11847 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11848 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4
11849 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm6
11850 ; AVX2-FCP-NEXT: vmovdqa 64(%r9), %ymm0
11851 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11852 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
11853 ; AVX2-FCP-NEXT: vmovdqa 64(%r8), %ymm1
11854 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11855 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
11856 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
11857 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
11858 ; AVX2-FCP-NEXT: vmovdqa 64(%rax), %ymm1
11859 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11860 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm5, %ymm1
11861 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm4
11862 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm5
11863 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11864 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm1
11865 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11866 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
11867 ; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1]
11868 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
11869 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[1,1,1,1,5,5,5,5]
11870 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
11871 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm15
11872 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm13
11873 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
11874 ; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11875 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[0,1,1,3,4,5,5,7]
11876 ; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11877 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15]
11878 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
11879 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2]
11880 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
11881 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
11882 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
11883 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm0
11884 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11885 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm0
11886 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11887 ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm1
11888 ; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
11889 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
11890 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,1,1,5,5,5,5]
11891 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15]
11892 ; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm0
11893 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11894 ; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm3
11895 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11896 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
11897 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,1,1,3,4,5,5,7]
11898 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7,8,9],ymm3[10],ymm9[11,12],ymm3[13],ymm9[14,15]
11899 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
11900 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2]
11901 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1
11902 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm6, %ymm0
11903 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11904 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0
11905 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11906 ; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %ymm1
11907 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11908 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
11909 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[1,1,1,1,5,5,5,5]
11910 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm9[2],ymm1[3,4],ymm9[5],ymm1[6,7,8,9],ymm9[10],ymm1[11,12],ymm9[13],ymm1[14,15]
11911 ; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %ymm3
11912 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11913 ; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %ymm0
11914 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11915 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
11916 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[0,1,1,3,4,5,5,7]
11917 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7,8,9],ymm0[10],ymm14[11,12],ymm0[13],ymm14[14,15]
11918 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
11919 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
11920 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0
11921 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
11922 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11923 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u]
11924 ; AVX2-FCP-NEXT: vmovdqa %ymm10, %ymm5
11925 ; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11926 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm1
11927 ; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm6
11928 ; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11929 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,2,2,3,5,6,6,7]
11930 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
11931 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
11932 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,5,2,2,6,6,6,6]
11933 ; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm8
11934 ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11935 ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm9, %ymm2
11936 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535]
11937 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm12
11938 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm2
11939 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11940 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,2,2,3,5,6,6,7]
11941 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7,8,9],ymm2[10],ymm14[11,12],ymm2[13],ymm14[14,15]
11942 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2]
11943 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11944 ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm9, %ymm14
11945 ; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm1
11946 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm14, %ymm2
11947 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11948 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm0
11949 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
11950 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm11[1,2,2,3,5,6,6,7]
11951 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7,8,9],ymm0[10],ymm14[11,12],ymm0[13],ymm14[14,15]
11952 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
11953 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
11954 ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm14
11955 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm14, %ymm0
11956 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11957 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
11958 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1]
11959 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm4
11960 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm13
11961 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[2,2,2,2,6,6,6,6]
11962 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm14[2],ymm4[3,4],ymm14[5],ymm4[6,7,8,9],ymm14[10],ymm4[11,12],ymm14[13],ymm4[14,15]
11963 ; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm1 = [151522058,0,421010202,421010202]
11964 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11965 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm14
11966 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm0
11967 ; AVX2-FCP-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
11968 ; AVX2-FCP-NEXT: # ymm15 = mem[2,2,2,2,6,6,6,6]
11969 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15]
11970 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
11971 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3]
11972 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
11973 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm4, %ymm14, %ymm4
11974 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
11975 ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm4, %ymm12, %ymm1
11976 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11977 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
11978 ; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm1
11979 ; AVX2-FCP-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
11980 ; AVX2-FCP-NEXT: # ymm4 = mem[2,2,2,2,6,6,6,6]
11981 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
11982 ; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload
11983 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm4
11984 ; AVX2-FCP-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
11985 ; AVX2-FCP-NEXT: # ymm15 = mem[2,2,2,2,6,6,6,6]
11986 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13,14,15]
11987 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
11988 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
11989 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
11990 ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1
11991 ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm1
11992 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11993 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11994 ; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm1
11995 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
11996 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[2,2,2,2,6,6,6,6]
11997 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
11998 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11999 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2
12000 ; AVX2-FCP-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
12001 ; AVX2-FCP-NEXT: # ymm4 = mem[2,2,2,2,6,6,6,6]
12002 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15]
12003 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
12004 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
12005 ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1
12006 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12007 ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm0
12008 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12009 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
12010 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1]
12011 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm0
12012 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7]
12013 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
12014 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
12015 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [6,7,3,3,7,7,6,7]
12016 ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm5, %ymm1
12017 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
12018 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
12019 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12020 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
12021 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[3,3,3,3,7,7,7,7]
12022 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm14[2],ymm0[3,4],ymm14[5],ymm0[6,7,8,9],ymm14[10],ymm0[11,12],ymm14[13],ymm0[14,15]
12023 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
12024 ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm5, %ymm14
12025 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm0, %ymm14, %ymm0
12026 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2
12027 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm11[3,3,3,3,7,7,7,7]
12028 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm14[2],ymm2[3,4],ymm14[5],ymm2[6,7,8,9],ymm14[10],ymm2[11,12],ymm14[13],ymm2[14,15]
12029 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
12030 ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm5, %ymm14
12031 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm14, %ymm2
12032 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12033 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
12034 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12035 ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm4
12036 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
12037 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[3,3,3,3,7,7,7,7]
12038 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0,1,2],ymm4[3],ymm14[4,5],ymm4[6],ymm14[7,8,9,10],ymm4[11],ymm14[12,13],ymm4[14],ymm14[15]
12039 ; AVX2-FCP-NEXT: vpmovsxdq {{.*#+}} ymm9 = [218894094,0,488382238,488382238]
12040 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12041 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm14
12042 ; AVX2-FCP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
12043 ; AVX2-FCP-NEXT: # ymm15 = mem[3,3,3,3,7,7,7,7]
12044 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15]
12045 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3]
12046 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3]
12047 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
12048 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm4, %ymm14, %ymm4
12049 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
12050 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm1
12051 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12052 ; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload
12053 ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm1
12054 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
12055 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7]
12056 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7,8,9,10],ymm1[11],ymm4[12,13],ymm1[14],ymm4[15]
12057 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm4
12058 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
12059 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm11[3,3,3,3,7,7,7,7]
12060 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13,14,15]
12061 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
12062 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
12063 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1
12064 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm0
12065 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12066 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
12067 ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm0
12068 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12069 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7]
12070 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
12071 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
12072 ; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm15, %ymm1
12073 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[3,3,3,3,7,7,7,7]
12074 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13,14,15]
12075 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
12076 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
12077 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
12078 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12079 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
12080 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12081 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,0,0,0,4,0,0,4]
12082 ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm0
12083 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31]
12084 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm1
12085 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [0,65535,0,0,0,0,0,0,65535,0,0,0,0,0,0,65535]
12086 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0
12087 ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm4, %ymm1
12088 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm2
12089 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1
12090 ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm4, %ymm2
12091 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm3
12092 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2
12093 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,3,0,0,0,4,0,0]
12094 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload
12095 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31]
12096 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12097 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4
12098 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,0,0,65535,0,0,0,0,0,0,65535,0,0,0,0,0]
12099 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3
12100 ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm6, %ymm4
12101 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm12, %ymm5
12102 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4
12103 ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm6, %ymm5
12104 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm6
12105 ; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5
12106 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535]
12107 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0
12108 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1
12109 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2
12110 ; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,3,0,4]
12111 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload
12112 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31]
12113 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12114 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm4
12115 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,0,0,0,0,65535,0,0,0,0,0,0,65535,0,0,0]
12116 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
12117 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload
12118 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12119 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm6
12120 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm4, %ymm6, %ymm4
12121 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
12122 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
12123 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm7
12124 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm6, %ymm7, %ymm5
12125 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,0,0,3,0,0,0,4]
12126 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload
12127 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0]
12128 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm3, %ymm6, %ymm3
12129 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload
12130 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4
12131 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload
12132 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
12133 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
12134 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0
12135 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12136 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm0
12137 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12138 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm0
12139 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12140 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12141 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
12142 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12143 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[1,1,1,1,5,5,5,5]
12144 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
12145 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12146 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
12147 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12148 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,1,1,3,4,5,5,7]
12149 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
12150 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
12151 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
12152 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
12153 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
12154 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,4,2,1,6,5,6,5]
12155 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
12156 ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm1
12157 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
12158 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
12159 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
12160 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15]
12161 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
12162 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
12163 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
12164 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12165 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7]
12166 ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm1
12167 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
12168 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
12169 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12170 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
12171 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[2,2,2,2,6,6,6,6]
12172 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
12173 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
12174 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[2,2,2,2,6,6,6,6]
12175 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
12176 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
12177 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
12178 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0]
12179 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
12180 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,6,2,3,6,7,5,6]
12181 ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm1
12182 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
12183 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
12184 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
12185 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
12186 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
12187 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
12188 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6]
12189 ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm1
12190 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
12191 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
12192 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12193 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
12194 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[3,3,3,3,7,7,7,7]
12195 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
12196 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
12197 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[3,3,3,3,7,7,7,7]
12198 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
12199 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
12200 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
12201 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
12202 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
12203 ; AVX2-FCP-NEXT: vpbroadcastd 124(%r8), %ymm1
12204 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
12205 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
12206 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
12207 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
12208 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
12209 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
12210 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7]
12211 ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm1, %ymm1
12212 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
12213 ; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
12214 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12215 ; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm15
12216 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
12217 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12218 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
12219 ; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12220 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
12221 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
12222 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,1,1]
12223 ; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm9
12224 ; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1
12225 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12226 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3]
12227 ; AVX2-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12228 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
12229 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
12230 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,3]
12231 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
12232 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm10
12233 ; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm3
12234 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12235 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
12236 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12237 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
12238 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
12239 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
12240 ; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm13
12241 ; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm3
12242 ; AVX2-FCP-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill
12243 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
12244 ; AVX2-FCP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12245 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3
12246 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
12247 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2
12248 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12249 ; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %xmm2
12250 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12251 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
12252 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12253 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
12254 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
12255 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
12256 ; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %xmm3
12257 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12258 ; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %xmm8
12259 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
12260 ; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12261 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3
12262 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
12263 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2
12264 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12265 ; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %xmm2
12266 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12267 ; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %xmm3
12268 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12269 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
12270 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0
12271 ; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %xmm3
12272 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12273 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm2
12274 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12275 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
12276 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
12277 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
12278 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
12279 ; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm7
12280 ; AVX2-FCP-NEXT: vmovdqa (%r9), %xmm0
12281 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12282 ; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm1
12283 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12284 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
12285 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12286 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
12287 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm1
12288 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1]
12289 ; AVX2-FCP-NEXT: vpbroadcastd (%rax), %ymm3
12290 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0]
12291 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm6
12292 ; AVX2-FCP-NEXT: vmovdqa 32(%r9), %xmm0
12293 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12294 ; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm1
12295 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12296 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
12297 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm3
12298 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
12299 ; AVX2-FCP-NEXT: vpbroadcastd 32(%rax), %ymm5
12300 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm1
12301 ; AVX2-FCP-NEXT: vmovdqa 64(%r9), %xmm0
12302 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12303 ; AVX2-FCP-NEXT: vmovdqa 64(%r8), %xmm3
12304 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12305 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
12306 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm3
12307 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
12308 ; AVX2-FCP-NEXT: vpbroadcastd 64(%rax), %ymm11
12309 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm3, %ymm11, %ymm0
12310 ; AVX2-FCP-NEXT: vmovdqa 96(%r9), %xmm11
12311 ; AVX2-FCP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12312 ; AVX2-FCP-NEXT: vmovdqa 96(%r8), %xmm3
12313 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12314 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3]
12315 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm2
12316 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1]
12317 ; AVX2-FCP-NEXT: vpbroadcastd 96(%rax), %ymm14
12318 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm14, %ymm2
12319 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
12320 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm10, %ymm6, %ymm6
12321 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12322 ; AVX2-FCP-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
12323 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12324 ; AVX2-FCP-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12325 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12326 ; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm7, %ymm2, %ymm0
12327 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12328 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
12329 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm0
12330 ; AVX2-FCP-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
12331 ; AVX2-FCP-NEXT: # xmm2 = mem[1,1,2,2]
12332 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7]
12333 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [6,7,8,9,6,7,8,9,6,7,8,9,6,7,8,9]
12334 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm15, %xmm2
12335 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
12336 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[1,1,2,3]
12337 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7]
12338 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
12339 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
12340 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
12341 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm0
12342 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12343 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm0
12344 ; AVX2-FCP-NEXT: vpshufd $165, (%rsp), %xmm2 # 16-byte Folded Reload
12345 ; AVX2-FCP-NEXT: # xmm2 = mem[1,1,2,2]
12346 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7]
12347 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
12348 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm2
12349 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12350 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[1,1,2,3]
12351 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm2[2],xmm14[3,4],xmm2[5],xmm14[6,7]
12352 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
12353 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
12354 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm4
12355 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12356 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
12357 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[1,1,2,2]
12358 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6],xmm0[7]
12359 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
12360 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm14
12361 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
12362 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[1,1,2,3]
12363 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3,4],xmm14[5],xmm11[6,7]
12364 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
12365 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
12366 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm11, %ymm2
12367 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12368 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm1
12369 ; AVX2-FCP-NEXT: vpshufd $165, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
12370 ; AVX2-FCP-NEXT: # xmm11 = mem[1,1,2,2]
12371 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm1[1],xmm11[2,3],xmm1[4],xmm11[5,6],xmm1[7]
12372 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
12373 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm6
12374 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12375 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[1,1,2,3]
12376 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0,1],xmm6[2],xmm11[3,4],xmm6[5],xmm11[6,7]
12377 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
12378 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
12379 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm6, %ymm1
12380 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
12381 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
12382 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7
12383 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3]
12384 ; AVX2-FCP-NEXT: vpbroadcastd 4(%rax), %ymm11
12385 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0]
12386 ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm7, %ymm11, %ymm7
12387 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm11
12388 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3]
12389 ; AVX2-FCP-NEXT: vpbroadcastd 36(%rax), %ymm12
12390 ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm11, %ymm12, %ymm11
12391 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5
12392 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3]
12393 ; AVX2-FCP-NEXT: vpbroadcastd 68(%rax), %ymm12
12394 ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm5, %ymm12, %ymm5
12395 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
12396 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
12397 ; AVX2-FCP-NEXT: vpbroadcastd 100(%rax), %ymm6
12398 ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm3, %ymm6, %ymm3
12399 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
12400 ; AVX2-FCP-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
12401 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12402 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm4, %ymm11, %ymm11
12403 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm12
12404 ; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm3, %ymm14
12405 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12406 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
12407 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
12408 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
12409 ; AVX2-FCP-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
12410 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
12411 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1
12412 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
12413 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7]
12414 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
12415 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,0,0,65535,65535,0,0,0,0,0,65535,65535,0,0,0,0]
12416 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
12417 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
12418 ; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload
12419 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
12420 ; AVX2-FCP-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
12421 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
12422 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,3]
12423 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7]
12424 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
12425 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2
12426 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
12427 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
12428 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12429 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
12430 ; AVX2-FCP-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
12431 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3
12432 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3]
12433 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7]
12434 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
12435 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm3, %ymm6, %ymm3
12436 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
12437 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm4
12438 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12439 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
12440 ; AVX2-FCP-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
12441 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3]
12442 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7]
12443 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
12444 ; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm4, %ymm6, %ymm4
12445 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12446 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
12447 ; AVX2-FCP-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
12448 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
12449 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm5
12450 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
12451 ; AVX2-FCP-NEXT: vpbroadcastd 8(%rax), %ymm6
12452 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,65535,0,0,0,0,0,65535,65535,0,0,0,0,0,65535,65535]
12453 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
12454 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12455 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
12456 ; AVX2-FCP-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
12457 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6
12458 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
12459 ; AVX2-FCP-NEXT: vpbroadcastd 40(%rax), %ymm8
12460 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm6
12461 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12462 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
12463 ; AVX2-FCP-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
12464 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm8
12465 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
12466 ; AVX2-FCP-NEXT: vpbroadcastd 72(%rax), %ymm9
12467 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm8
12468 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12469 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
12470 ; AVX2-FCP-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
12471 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9
12472 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
12473 ; AVX2-FCP-NEXT: vpbroadcastd 104(%rax), %ymm10
12474 ; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm9, %ymm10, %ymm7
12475 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
12476 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm5, %ymm1, %ymm1
12477 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm2, %ymm2
12478 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm8, %ymm3, %ymm3
12479 ; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm4, %ymm4
12480 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
12481 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12482 ; AVX2-FCP-NEXT: vmovaps %ymm5, 544(%rax)
12483 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12484 ; AVX2-FCP-NEXT: vmovaps %ymm5, 320(%rax)
12485 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12486 ; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rax)
12487 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12488 ; AVX2-FCP-NEXT: vmovaps %ymm5, 640(%rax)
12489 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12490 ; AVX2-FCP-NEXT: vmovaps %ymm5, 608(%rax)
12491 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12492 ; AVX2-FCP-NEXT: vmovaps %ymm5, 576(%rax)
12493 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12494 ; AVX2-FCP-NEXT: vmovaps %ymm5, 416(%rax)
12495 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12496 ; AVX2-FCP-NEXT: vmovaps %ymm5, 384(%rax)
12497 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12498 ; AVX2-FCP-NEXT: vmovaps %ymm5, 352(%rax)
12499 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12500 ; AVX2-FCP-NEXT: vmovaps %ymm5, 192(%rax)
12501 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12502 ; AVX2-FCP-NEXT: vmovaps %ymm5, 160(%rax)
12503 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12504 ; AVX2-FCP-NEXT: vmovaps %ymm5, 128(%rax)
12505 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12506 ; AVX2-FCP-NEXT: vmovaps %ymm5, 768(%rax)
12507 ; AVX2-FCP-NEXT: vmovdqa %ymm4, 736(%rax)
12508 ; AVX2-FCP-NEXT: vmovdqa %ymm14, 704(%rax)
12509 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12510 ; AVX2-FCP-NEXT: vmovaps %ymm0, 672(%rax)
12511 ; AVX2-FCP-NEXT: vmovdqa %ymm3, 512(%rax)
12512 ; AVX2-FCP-NEXT: vmovdqa %ymm12, 480(%rax)
12513 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12514 ; AVX2-FCP-NEXT: vmovaps %ymm0, 448(%rax)
12515 ; AVX2-FCP-NEXT: vmovdqa %ymm2, 288(%rax)
12516 ; AVX2-FCP-NEXT: vmovdqa %ymm11, 256(%rax)
12517 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12518 ; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rax)
12519 ; AVX2-FCP-NEXT: vmovdqa %ymm1, 64(%rax)
12520 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12521 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax)
12522 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12523 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax)
12524 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12525 ; AVX2-FCP-NEXT: vmovaps %ymm0, 864(%rax)
12526 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12527 ; AVX2-FCP-NEXT: vmovaps %ymm0, 832(%rax)
12528 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12529 ; AVX2-FCP-NEXT: vmovaps %ymm0, 800(%rax)
12530 ; AVX2-FCP-NEXT: addq $1256, %rsp # imm = 0x4E8
12531 ; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i16_stride7_vf64:
; AVX512: # %bb.0:
; AVX512-NEXT: subq $2808, %rsp # imm = 0xAF8
12537 ; AVX512-NEXT: vmovdqa 96(%rcx), %ymm6
12538 ; AVX512-NEXT: vmovdqa 96(%rdx), %ymm13
12539 ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm7
12540 ; AVX512-NEXT: vmovdqa 96(%rsi), %ymm8
12541 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
12542 ; AVX512-NEXT: vpshufb %ymm0, %ymm6, %ymm2
12543 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
12544 ; AVX512-NEXT: vpshufb %ymm1, %ymm13, %ymm3
12545 ; AVX512-NEXT: vporq %ymm2, %ymm3, %ymm17
12546 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
12547 ; AVX512-NEXT: vpshufb %ymm10, %ymm8, %ymm2
12548 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
12549 ; AVX512-NEXT: vpshufb %ymm11, %ymm7, %ymm3
12550 ; AVX512-NEXT: vporq %ymm2, %ymm3, %ymm18
12551 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
12552 ; AVX512-NEXT: vmovdqa 64(%r9), %ymm2
12553 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12554 ; AVX512-NEXT: vpshufb %ymm12, %ymm2, %ymm2
12555 ; AVX512-NEXT: vmovdqa 64(%r8), %ymm3
12556 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12557 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
12558 ; AVX512-NEXT: vpshufb %ymm9, %ymm3, %ymm3
12559 ; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
12560 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12561 ; AVX512-NEXT: vmovdqa 64(%rcx), %ymm3
12562 ; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm2
12563 ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm26
12564 ; AVX512-NEXT: vmovdqa 64(%rdx), %ymm4
12565 ; AVX512-NEXT: vpshufb %ymm1, %ymm4, %ymm3
12566 ; AVX512-NEXT: vmovdqa64 %ymm4, %ymm27
12567 ; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
12568 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12569 ; AVX512-NEXT: vmovdqa 64(%rsi), %ymm3
12570 ; AVX512-NEXT: vpshufb %ymm10, %ymm3, %ymm2
12571 ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm23
12572 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm4
12573 ; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm3
12574 ; AVX512-NEXT: vmovdqa64 %ymm4, %ymm22
12575 ; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
12576 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12577 ; AVX512-NEXT: vmovdqa (%r9), %ymm2
12578 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12579 ; AVX512-NEXT: vpshufb %ymm12, %ymm2, %ymm2
12580 ; AVX512-NEXT: vmovdqa (%r8), %ymm3
12581 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12582 ; AVX512-NEXT: vpshufb %ymm9, %ymm3, %ymm3
12583 ; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
12584 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12585 ; AVX512-NEXT: vmovdqa (%rcx), %ymm2
12586 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12587 ; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2
12588 ; AVX512-NEXT: vmovdqa (%rdx), %ymm3
12589 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12590 ; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm3
12591 ; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
12592 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12593 ; AVX512-NEXT: vmovdqa (%rsi), %ymm2
12594 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12595 ; AVX512-NEXT: vpshufb %ymm10, %ymm2, %ymm2
12596 ; AVX512-NEXT: vmovdqa (%rdi), %ymm3
12597 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12598 ; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3
12599 ; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
12600 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12601 ; AVX512-NEXT: vmovdqa 32(%rcx), %ymm4
12602 ; AVX512-NEXT: vpshufb %ymm0, %ymm4, %ymm0
12603 ; AVX512-NEXT: vmovdqa 32(%rdx), %ymm2
12604 ; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1
12605 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
12606 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12607 ; AVX512-NEXT: vmovdqa 32(%rsi), %ymm5
12608 ; AVX512-NEXT: vpshufb %ymm10, %ymm5, %ymm0
12609 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm3
12610 ; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm1
12611 ; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
12612 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12613 ; AVX512-NEXT: vmovdqa 32(%r8), %ymm1
12614 ; AVX512-NEXT: vpshufb %ymm9, %ymm1, %ymm9
12615 ; AVX512-NEXT: vmovdqa 32(%r9), %ymm0
12616 ; AVX512-NEXT: vpshufb %ymm12, %ymm0, %ymm10
12617 ; AVX512-NEXT: vpor %ymm10, %ymm9, %ymm9
12618 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12619 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
12620 ; AVX512-NEXT: # ymm10 = mem[0,1,0,1]
12621 ; AVX512-NEXT: vpshufb %ymm10, %ymm6, %ymm9
12622 ; AVX512-NEXT: vmovdqa64 %ymm10, %ymm31
12623 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[2,2,2,2,6,6,6,6]
12624 ; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
12625 ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
12626 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12627 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
12628 ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
12629 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[3,3,3,3,7,7,7,7]
12630 ; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15]
12631 ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
12632 ; AVX512-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12633 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
12634 ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
12635 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[2,2,2,2,6,6,6,6]
12636 ; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15]
12637 ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
12638 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12639 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
12640 ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
12641 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[3,3,3,3,7,7,7,7]
12642 ; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15]
12643 ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3]
12644 ; AVX512-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12645 ; AVX512-NEXT: vmovdqa 96(%r9), %ymm9
12646 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
12647 ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3]
12648 ; AVX512-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12649 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[2,1,2,3,6,5,6,7]
12650 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
12651 ; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm10[2,2,2,2]
12652 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[0,1,1,3,4,5,5,7]
12653 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
12654 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4]
12655 ; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7,8,9],ymm6[10],ymm10[11,12],ymm6[13],ymm10[14,15]
12656 ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2]
12657 ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5]
12658 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
12659 ; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5]
12660 ; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15]
12661 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
12662 ; AVX512-NEXT: vprold $16, %ymm9, %ymm8
12663 ; AVX512-NEXT: vpshufb %ymm12, %ymm9, %ymm9
12664 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm17, %zmm6
12665 ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm7
12666 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm6 ^ (mem & (zmm7 ^ zmm6))
12667 ; AVX512-NEXT: vmovdqa 96(%r8), %ymm6
12668 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[1,2,2,3,5,6,6,7]
12669 ; AVX512-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12670 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,0,2,1,4,4,6,5]
12671 ; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero
12672 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
12673 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm7 & ymm11)
12674 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm12
12675 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
12676 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm6 & ymm11)
12677 ; AVX512-NEXT: vextracti64x4 $1, %zmm7, %ymm6
12678 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm10[2,1,3,3]
12679 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
12680 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm10 & (ymm7 ^ ymm6))
12681 ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm8[2,2,2,2]
12682 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
12683 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm8 & (ymm6 ^ ymm7))
12684 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6
12685 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[4,5,6,7]
12686 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12687 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
12688 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
12689 ; AVX512-NEXT: vmovdqa 96(%rax), %ymm6
12690 ; AVX512-NEXT: vpermd %zmm6, %zmm18, %zmm7
12691 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12692 ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7]
12693 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
12694 ; AVX512-NEXT: vpshufb %ymm11, %ymm6, %ymm6
12695 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
12696 ; AVX512-NEXT: vpandn %ymm7, %ymm12, %ymm7
12697 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm19
12698 ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
12699 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12700 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
12701 ; AVX512-NEXT: vpbroadcastd 72(%rax), %ymm6
12702 ; AVX512-NEXT: vpandnq %ymm6, %ymm28, %ymm6
12703 ; AVX512-NEXT: vmovdqa 64(%rax), %ymm7
12704 ; AVX512-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12705 ; AVX512-NEXT: vpshufb %ymm11, %ymm7, %ymm7
12706 ; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
12707 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12708 ; AVX512-NEXT: vmovdqa 64(%r9), %xmm7
12709 ; AVX512-NEXT: vmovdqa 64(%r8), %xmm8
12710 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
12711 ; AVX512-NEXT: vmovdqa64 %xmm8, %xmm17
12712 ; AVX512-NEXT: vmovdqa64 %xmm7, %xmm20
12713 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
12714 ; AVX512-NEXT: vpshufb %xmm15, %xmm6, %xmm6
12715 ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
12716 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12717 ; AVX512-NEXT: vmovdqa 64(%rcx), %xmm9
12718 ; AVX512-NEXT: vmovdqa 64(%rdx), %xmm7
12719 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
12720 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
12721 ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
12722 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12723 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm14
12724 ; AVX512-NEXT: vmovdqa 64(%rsi), %xmm10
12725 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
12726 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
12727 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
12728 ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3]
12729 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12730 ; AVX512-NEXT: vpbroadcastd 8(%rax), %ymm8
12731 ; AVX512-NEXT: vpandnq %ymm8, %ymm28, %ymm8
12732 ; AVX512-NEXT: vmovdqa (%rax), %ymm12
12733 ; AVX512-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12734 ; AVX512-NEXT: vpshufb %ymm11, %ymm12, %ymm13
12735 ; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8
12736 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12737 ; AVX512-NEXT: vmovdqa (%r9), %xmm6
12738 ; AVX512-NEXT: vmovdqa (%r8), %xmm12
12739 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
12740 ; AVX512-NEXT: vmovdqa64 %xmm12, %xmm29
12741 ; AVX512-NEXT: vmovdqa64 %xmm6, %xmm24
12742 ; AVX512-NEXT: vpshufb %xmm15, %xmm8, %xmm8
12743 ; AVX512-NEXT: vmovdqa64 %xmm15, %xmm25
12744 ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
12745 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12746 ; AVX512-NEXT: vmovdqa (%rcx), %xmm8
12747 ; AVX512-NEXT: vmovdqa (%rdx), %xmm13
12748 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
12749 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,3,3,4,5,6,7]
12750 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm15[0,0,2,1]
12751 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12752 ; AVX512-NEXT: vmovdqa (%rdi), %xmm6
12753 ; AVX512-NEXT: vmovdqa (%rsi), %xmm12
12754 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
12755 ; AVX512-NEXT: vmovdqa64 %xmm12, %xmm21
12756 ; AVX512-NEXT: vmovdqa64 %xmm6, %xmm16
12757 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,1,2,3,4,5,6,7]
12758 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4]
12759 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm15[0,0,1,3]
12760 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12761 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
12762 ; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,0,0,4,4,4,4]
12763 ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[0,1,1,3,4,5,5,7]
12764 ; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
12765 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2]
12766 ; AVX512-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12767 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
12768 ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5]
12769 ; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5]
12770 ; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
12771 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
12772 ; AVX512-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12773 ; AVX512-NEXT: vmovdqa 32(%rax), %ymm15
12774 ; AVX512-NEXT: vpshufb %ymm11, %ymm15, %ymm11
12775 ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm15[0,1,1,3,4,5,5,7]
12776 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3]
12777 ; AVX512-NEXT: vpandnq %ymm12, %ymm19, %ymm12
12778 ; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11
12779 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12780 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
12781 ; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,0,0,4,4,4,4]
12782 ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,0,2,1,4,4,6,5]
12783 ; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8,9,10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
12784 ; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3]
12785 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12786 ; AVX512-NEXT: vmovdqa64 %ymm31, %ymm6
12787 ; AVX512-NEXT: vpshufb %ymm6, %ymm4, %ymm11
12788 ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6]
12789 ; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15]
12790 ; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
12791 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12792 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
12793 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
12794 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
12795 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15]
12796 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
12797 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12798 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
12799 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
12800 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6]
12801 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15]
12802 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
12803 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12804 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7]
12805 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
12806 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
12807 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
12808 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
12809 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12810 ; AVX512-NEXT: vprold $16, %ymm0, %ymm2
12811 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,2,2,3,5,6,6,7]
12812 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
12813 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2]
12814 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12815 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7]
12816 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
12817 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,3,6,6,6,7]
12818 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
12819 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
12820 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12821 ; AVX512-NEXT: vpermd %zmm15, %zmm18, %zmm0
12822 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12823 ; AVX512-NEXT: vmovdqa64 %ymm23, %ymm12
12824 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
12825 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
12826 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[1,1,1,1,5,5,5,5]
12827 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
12828 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
12829 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12830 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
12831 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
12832 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[2,2,2,2,6,6,6,6]
12833 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
12834 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
12835 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12836 ; AVX512-NEXT: vmovdqa64 %ymm26, %ymm2
12837 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
12838 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
12839 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[0,1,1,3,4,5,5,7]
12840 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
12841 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
12842 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12843 ; AVX512-NEXT: vpshufb %ymm6, %ymm2, %ymm0
12844 ; AVX512-NEXT: vmovdqa64 %ymm26, %ymm3
12845 ; AVX512-NEXT: vmovdqa64 %ymm31, %ymm11
12846 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[2,2,2,2,6,6,6,6]
12847 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
12848 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
12849 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12850 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12851 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
12852 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
12853 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12854 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,0,2,1,4,4,6,5]
12855 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
12856 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
12857 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12858 ; AVX512-NEXT: vprold $16, %ymm5, %ymm0
12859 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[1,2,2,3,5,6,6,7]
12860 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
12861 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
12862 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12863 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm27[3,3,3,3,7,7,7,7]
12864 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
12865 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
12866 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15]
12867 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
12868 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12869 ; AVX512-NEXT: vmovdqa 96(%rcx), %xmm0
12870 ; AVX512-NEXT: vmovdqa 96(%rdx), %xmm1
12871 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
12872 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7]
12873 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
12874 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12875 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7]
12876 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
12877 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
12878 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
12879 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
12880 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12881 ; AVX512-NEXT: vmovdqa 96(%rsi), %xmm2
12882 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm3
12883 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
12884 ; AVX512-NEXT: vmovdqa64 %xmm25, %xmm12
12885 ; AVX512-NEXT: vpshufb %xmm12, %xmm4, %xmm4
12886 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
12887 ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12888 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[3,3,3,3,7,7,7,7]
12889 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
12890 ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,3,6,6,6,7]
12891 ; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
12892 ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12893 ; AVX512-NEXT: vprold $16, %xmm2, %xmm4
12894 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,3]
12895 ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
12896 ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12897 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
12898 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12899 ; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm6 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
12900 ; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm2
12901 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2]
12902 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
12903 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12904 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
12905 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12906 ; AVX512-NEXT: vmovdqa 96(%r9), %xmm0
12907 ; AVX512-NEXT: vmovdqa 96(%r8), %xmm1
12908 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
12909 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12910 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
12911 ; AVX512-NEXT: vpshufb %xmm12, %xmm0, %xmm0
12912 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
12913 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,7,6]
12914 ; AVX512-NEXT: vpermt2d %zmm1, %zmm18, %zmm0
12915 ; AVX512-NEXT: vpbroadcastd 100(%rax), %ymm1
12916 ; AVX512-NEXT: vpbroadcastd 104(%rax), %ymm2
12917 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm31
12918 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm31 = zmm31 ^ (mem & (zmm31 ^ zmm0))
12919 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
12920 ; AVX512-NEXT: vpshufb %xmm6, %xmm9, %xmm1
12921 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2]
12922 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
12923 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
12924 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3]
12925 ; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm1
12926 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
12927 ; AVX512-NEXT: vpshufb %xmm12, %xmm0, %xmm0
12928 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
12929 ; AVX512-NEXT: vprold $16, %xmm10, %xmm3
12930 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[1,1,2,3]
12931 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
12932 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
12933 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14
12934 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
12935 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm1 ^ (zmm0 & (zmm14 ^ zmm1))
12936 ; AVX512-NEXT: vmovdqa64 %xmm17, %xmm1
12937 ; AVX512-NEXT: vmovdqa64 %xmm20, %xmm3
12938 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
12939 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,7,6]
12940 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
12941 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
12942 ; AVX512-NEXT: vpermt2d %zmm3, %zmm4, %zmm1
12943 ; AVX512-NEXT: vpbroadcastd 64(%rax), %ymm3
12944 ; AVX512-NEXT: vpbroadcastd 68(%rax), %ymm5
12945 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm25
12946 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
12947 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm25 = zmm25 ^ (zmm19 & (zmm25 ^ zmm1))
12948 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
12949 ; AVX512-NEXT: vpshufb %xmm6, %xmm8, %xmm3
12950 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,2]
12951 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7]
12952 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
12953 ; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm3
12954 ; AVX512-NEXT: vmovdqa64 %xmm16, %xmm5
12955 ; AVX512-NEXT: vmovdqa64 %xmm21, %xmm2
12956 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
12957 ; AVX512-NEXT: vpshufb %xmm12, %xmm1, %xmm1
12958 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
12959 ; AVX512-NEXT: vprold $16, %xmm21, %xmm2
12960 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm16[1,1,2,3]
12961 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7]
12962 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
12963 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
12964 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm0 & (zmm5 ^ zmm3))
12965 ; AVX512-NEXT: vmovdqa64 %xmm29, %xmm0
12966 ; AVX512-NEXT: vmovdqa64 %xmm24, %xmm1
12967 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
12968 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6]
12969 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
12970 ; AVX512-NEXT: vpermt2d %zmm1, %zmm4, %zmm0
12971 ; AVX512-NEXT: vpbroadcastd (%rax), %ymm1
12972 ; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm2
12973 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm20
12974 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm20 = zmm20 ^ (zmm19 & (zmm20 ^ zmm0))
12975 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12976 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
12977 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
12978 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
12979 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5]
12980 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
12981 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
12982 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12983 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
12984 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
12985 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,2,2,2,6,6,6,6]
12986 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
12987 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
12988 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12989 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12990 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
12991 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
12992 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12993 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7]
12994 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
12995 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
12996 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12997 ; AVX512-NEXT: vpshufb %ymm11, %ymm9, %ymm0
12998 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6]
12999 ; AVX512-NEXT: vmovdqa %ymm3, %ymm11
13000 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
13001 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
13002 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
13003 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
13004 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
13005 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,0,2,1,4,4,6,5]
13006 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
13007 ; AVX512-NEXT: vprold $16, %ymm7, %ymm0
13008 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[1,2,2,3,5,6,6,7]
13009 ; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15]
13010 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7]
13011 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
13012 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
13013 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15]
13014 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7]
13015 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
13016 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6]
13017 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7,8,9,10],ymm6[11],ymm0[12,13],ymm6[14],ymm0[15]
13018 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[3,3,3,3,7,7,7,7]
13019 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
13020 ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7]
13021 ; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
13022 ; AVX512-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13023 ; AVX512-NEXT: vpermq {{.*#+}} ymm22 = ymm1[2,2,2,3]
13024 ; AVX512-NEXT: vpermq {{.*#+}} ymm26 = ymm3[2,2,3,3]
13025 ; AVX512-NEXT: vpermq {{.*#+}} ymm24 = ymm4[2,1,3,2]
13026 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm9
13027 ; AVX512-NEXT: vmovdqa 32(%rsi), %xmm10
13028 ; AVX512-NEXT: vprold $16, %xmm10, %xmm1
13029 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3]
13030 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
13031 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13032 ; AVX512-NEXT: vpermq {{.*#+}} ymm21 = ymm2[0,2,2,3]
13033 ; AVX512-NEXT: vmovdqa 32(%r9), %xmm1
13034 ; AVX512-NEXT: vmovdqa 32(%r8), %xmm2
13035 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13036 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
13037 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
13038 ; AVX512-NEXT: vpshufb %xmm12, %xmm2, %xmm3
13039 ; AVX512-NEXT: vpshufb %xmm12, %xmm1, %xmm1
13040 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,7,6]
13041 ; AVX512-NEXT: vmovdqa64 %xmm6, %xmm27
13042 ; AVX512-NEXT: vpermt2d %zmm2, %zmm18, %zmm1
13043 ; AVX512-NEXT: vpbroadcastd 36(%rax), %ymm2
13044 ; AVX512-NEXT: vpbroadcastd 40(%rax), %ymm4
13045 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm13
13046 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm1))
13047 ; AVX512-NEXT: vmovdqa 32(%rcx), %xmm7
13048 ; AVX512-NEXT: vmovdqa 32(%rdx), %xmm6
13049 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
13050 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7]
13051 ; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,0,1,1]
13052 ; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm0[2,1,3,3]
13053 ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,0,1,1]
13054 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13055 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
13056 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13057 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload
13058 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
13059 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm4 & (zmm1 ^ zmm3))
13060 ; AVX512-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
13061 ; AVX512-NEXT: # ymm3 = mem[2,1,3,2]
13062 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
13063 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm29 & (ymm3 ^ ymm1))
13064 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm30 = ymm30 ^ (ymm28 & (ymm30 ^ ymm3))
13065 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13066 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
13067 ; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm3 # 64-byte Folded Reload
13068 ; AVX512-NEXT: # zmm3 = (zmm3 & zmm28) | mem
13069 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13070 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
13071 ; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm0 # 64-byte Folded Reload
13072 ; AVX512-NEXT: # zmm0 = (zmm0 & zmm28) | mem
13073 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
13074 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 32-byte Folded Reload
13075 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
13076 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm18 # 32-byte Folded Reload
13077 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
13078 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = zmm23 ^ (zmm28 & (zmm18 ^ zmm23))
13079 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
13080 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = zmm3 ^ (zmm2 & (zmm18 ^ zmm3))
13081 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13082 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
13083 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
13084 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm23 # 32-byte Folded Reload
13085 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = zmm3 ^ (zmm28 & (zmm23 ^ zmm3))
13086 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = zmm0 ^ (zmm2 & (zmm23 ^ zmm0))
13087 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13088 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
13089 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
13090 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 32-byte Folded Reload
13091 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm0 ^ (mem & (zmm17 ^ zmm0))
13092 ; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm0
13093 ; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = mem ^ (ymm19 & (ymm0 ^ mem))
13094 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
13095 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13096 ; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm1 & (ymm2 ^ ymm0))
13097 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
13098 ; AVX512-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm19 # 64-byte Folded Reload
13099 ; AVX512-NEXT: # zmm19 = zmm19 | (zmm1 & mem)
13100 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13101 ; AVX512-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
13102 ; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[0,1,2,3]
13103 ; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
13104 ; AVX512-NEXT: # zmm0 = (zmm0 & zmm1) | mem
13105 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1
13106 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm30[0,1,2,3],zmm1[4,5,6,7]
13107 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
13108 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
13109 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm30 = zmm30 ^ (zmm2 & (zmm30 ^ zmm1))
13110 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
13111 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
13112 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
13113 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm11 = zmm11 ^ (zmm2 & (zmm11 ^ zmm1))
13114 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
13115 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm1 & (zmm25 ^ zmm14))
13116 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm1 & (zmm20 ^ zmm5))
13117 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
13118 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
13119 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
13120 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
13121 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
13122 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm5 & (zmm2 ^ zmm1))
13123 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
13124 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
13125 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13126 ; AVX512-NEXT: vinserti64x4 $1, %ymm22, %zmm3, %zmm22
13127 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm1 ^ (zmm5 & (zmm22 ^ zmm1))
13128 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
13129 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
13130 ; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
13131 ; AVX512-NEXT: vpermd 64(%rax), %zmm14, %zmm5
13132 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
13133 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm5 = zmm5 ^ (zmm3 & (zmm5 ^ zmm1))
13134 ; AVX512-NEXT: vinserti64x4 $1, %ymm24, %zmm26, %zmm1
13135 ; AVX512-NEXT: vpermd (%rax), %zmm14, %zmm14
13136 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm14 = zmm14 ^ (zmm3 & (zmm14 ^ zmm1))
13137 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
13138 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm2))
13139 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm1 & (zmm14 ^ zmm22))
13140 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
13141 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
13142 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
13143 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
13144 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm4 & (zmm2 ^ zmm1))
13145 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
13146 ; AVX512-NEXT: # ymm1 = mem[0,1,1,3]
13147 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13148 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
13149 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13150 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 # 32-byte Folded Reload
13151 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm1 ^ (zmm4 & (zmm22 ^ zmm1))
13152 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm12[0,1,1,3]
13153 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm21, %zmm1
13154 ; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm15, %zmm8
13155 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm1 ^ (zmm4 & (zmm8 ^ zmm1))
13156 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
13157 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
13158 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2]
13159 ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm4[1],xmm9[2,3],xmm4[4],xmm9[5,6],xmm4[7]
13160 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
13161 ; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
13162 ; AVX512-NEXT: # ymm7 = mem[2,2,2,3]
13163 ; AVX512-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
13164 ; AVX512-NEXT: # xmm9 = mem[0,1,3,2,4,5,6,7]
13165 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,3]
13166 ; AVX512-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
13167 ; AVX512-NEXT: # ymm10 = mem[2,3,3,3,6,7,7,7]
13168 ; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
13169 ; AVX512-NEXT: # ymm16 = mem[0,0,2,1]
13170 ; AVX512-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
13171 ; AVX512-NEXT: # xmm12 = mem[2,1,2,3,4,5,6,7]
13172 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
13173 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3]
13174 ; AVX512-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
13175 ; AVX512-NEXT: # ymm21 = mem[0,0,1,1]
13176 ; AVX512-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
13177 ; AVX512-NEXT: # xmm15 = mem[0,2,3,3,4,5,6,7]
13178 ; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1]
13179 ; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
13180 ; AVX512-NEXT: # ymm24 = mem[2,2,2,3]
13181 ; AVX512-NEXT: vmovdqa64 %xmm27, %xmm3
13182 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7]
13183 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,3]
13184 ; AVX512-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
13185 ; AVX512-NEXT: # ymm26 = mem[2,3,3,3,6,7,7,7]
13186 ; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
13187 ; AVX512-NEXT: # ymm27 = mem[0,0,2,1]
13188 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
13189 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4]
13190 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
13191 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
13192 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7]
13193 ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
13194 ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
13195 ; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7
13196 ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm10[2,1,3,2]
13197 ; AVX512-NEXT: vpbroadcastd 96(%rax), %ymm10
13198 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9
13199 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm9 = zmm9 ^ (zmm29 & (zmm9 ^ zmm7))
13200 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
13201 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm24, %zmm3
13202 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm26[2,1,3,2]
13203 ; AVX512-NEXT: vpbroadcastd 32(%rax), %ymm10
13204 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7
13205 ; AVX512-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (zmm29 & (zmm7 ^ zmm3))
13206 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
13207 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm3 & (zmm9 ^ zmm22))
13208 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm3 & (zmm7 ^ zmm8))
13209 ; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm16, %zmm3
13210 ; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm8
13211 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm3 ^ (zmm28 & (zmm8 ^ zmm3))
13212 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm27, %zmm1
13213 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm3
13214 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm1 ^ (zmm28 & (zmm3 ^ zmm1))
13215 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
13216 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm1 & (zmm31 ^ zmm8))
13217 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm1 & (zmm13 ^ zmm3))
13218 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm17))
13219 ; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm2))
13220 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
13221 ; AVX512-NEXT: vmovdqa64 %zmm0, 320(%rax)
13222 ; AVX512-NEXT: vmovdqa64 %zmm13, 256(%rax)
13223 ; AVX512-NEXT: vmovdqa64 %zmm7, 192(%rax)
13224 ; AVX512-NEXT: vmovdqa64 %zmm14, 128(%rax)
13225 ; AVX512-NEXT: vmovdqa64 %zmm23, 64(%rax)
13226 ; AVX512-NEXT: vmovdqa64 %zmm20, (%rax)
13227 ; AVX512-NEXT: vmovdqa64 %zmm25, 448(%rax)
13228 ; AVX512-NEXT: vmovdqa64 %zmm31, 704(%rax)
13229 ; AVX512-NEXT: vmovdqa64 %zmm9, 640(%rax)
13230 ; AVX512-NEXT: vmovdqa64 %zmm5, 576(%rax)
13231 ; AVX512-NEXT: vmovdqa64 %zmm18, 512(%rax)
13232 ; AVX512-NEXT: vmovdqa64 %zmm11, 384(%rax)
13233 ; AVX512-NEXT: vmovdqa64 %zmm19, 768(%rax)
13234 ; AVX512-NEXT: vmovdqa64 %zmm30, 832(%rax)
13235 ; AVX512-NEXT: addq $2808, %rsp # imm = 0xAF8
13236 ; AVX512-NEXT: vzeroupper
13237 ; AVX512-NEXT: retq
13239 ; AVX512-FCP-LABEL: store_i16_stride7_vf64:
13240 ; AVX512-FCP: # %bb.0:
13241 ; AVX512-FCP-NEXT: subq $1496, %rsp # imm = 0x5D8
13242 ; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm3
13243 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
13244 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm0
13245 ; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm1
13246 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
13247 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm2
13248 ; AVX512-FCP-NEXT: vporq %ymm0, %ymm2, %ymm16
13249 ; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm4
13250 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
13251 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm0
13252 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm5
13253 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
13254 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm2
13255 ; AVX512-FCP-NEXT: vpor %ymm0, %ymm2, %ymm7
13256 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
13257 ; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm2
13258 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm0
13259 ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm26
13260 ; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm12
13261 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
13262 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm2
13263 ; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm28
13264 ; AVX512-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0
13265 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13266 ; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm0
13267 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14
13268 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm31
13269 ; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm0
13270 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm15
13271 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
13272 ; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm12
13273 ; AVX512-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13274 ; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm0
13275 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm14
13276 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm29
13277 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm0
13278 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm15
13279 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm17
13280 ; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm12
13281 ; AVX512-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13282 ; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm0
13283 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13284 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm14
13285 ; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm0
13286 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13287 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm15
13288 ; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm12
13289 ; AVX512-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13290 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm0
13291 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13292 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14
13293 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm0
13294 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13295 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm12
13296 ; AVX512-FCP-NEXT: vpor %ymm14, %ymm12, %ymm12
13297 ; AVX512-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13298 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm0
13299 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13300 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm12
13301 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0
13302 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13303 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm14
13304 ; AVX512-FCP-NEXT: vpor %ymm12, %ymm14, %ymm12
13305 ; AVX512-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13306 ; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm14
13307 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm14, %ymm8
13308 ; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm15
13309 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm15, %ymm9
13310 ; AVX512-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8
13311 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13312 ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm0
13313 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm8
13314 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
13315 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm9
13316 ; AVX512-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8
13317 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13318 ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm8
13319 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm8, %ymm10
13320 ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm9
13321 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm11
13322 ; AVX512-FCP-NEXT: vporq %ymm11, %ymm10, %ymm22
13323 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
13324 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,1,3,4,5,5,7]
13325 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15]
13326 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
13327 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
13328 ; AVX512-FCP-NEXT: # ymm13 = mem[0,1,0,1]
13329 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm12
13330 ; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm30
13331 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm5[1,1,1,1,5,5,5,5]
13332 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7,8,9],ymm13[10],ymm12[11,12],ymm13[13],ymm12[14,15]
13333 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
13334 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm16, %zmm11
13335 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm12
13336 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm11 ^ (mem & (zmm12 ^ zmm11))
13337 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
13338 ; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm7
13339 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero
13340 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm12 & ymm13)
13341 ; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm10
13342 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm6
13343 ; AVX512-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13344 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
13345 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm11 & ymm16)
13346 ; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm11
13347 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,4,0,0,0,5,0,0]
13348 ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm12
13349 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
13350 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm11))
13351 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm19
13352 ; AVX512-FCP-NEXT: vprold $16, %ymm10, %ymm11
13353 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
13354 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
13355 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 ^ (ymm10 & (ymm11 ^ ymm12))
13356 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11
13357 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm11[4,5,6,7]
13358 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13359 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
13360 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7]
13361 ; AVX512-FCP-NEXT: vmovdqa 96(%rax), %ymm6
13362 ; AVX512-FCP-NEXT: vpermd %ymm6, %ymm11, %ymm11
13363 ; AVX512-FCP-NEXT: vpandn %ymm11, %ymm13, %ymm11
13364 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm18
13365 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
13366 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm12
13367 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11
13368 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13369 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
13370 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm11
13371 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm5[3,3,3,3,7,7,7,7]
13372 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8,9,10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15]
13373 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm12 = [151522058,0,421010202,421010202]
13374 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm4
13375 ; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm23
13376 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6]
13377 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
13378 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,2,3,10,9,11,11]
13379 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm4
13380 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm12 = [218894094,0,488382238,488382238]
13381 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm11
13382 ; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm16
13383 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,3,3,3,7,7,7,7]
13384 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15]
13385 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
13386 ; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1]
13387 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3
13388 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
13389 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15]
13390 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11]
13391 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm1
13392 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
13393 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm4 ^ (zmm11 & (zmm1 ^ zmm4))
13394 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13395 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,0,0,0,6,0,0,6]
13396 ; AVX512-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm1
13397 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13398 ; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm1
13399 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,5,0,0,0,6,0,0,30,0,0,0,31,0,0,31]
13400 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm3
13401 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13402 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
13403 ; AVX512-FCP-NEXT: vpbroadcastd 72(%rax), %ymm3
13404 ; AVX512-FCP-NEXT: vpandn %ymm3, %ymm6, %ymm3
13405 ; AVX512-FCP-NEXT: vmovdqa 64(%rax), %ymm7
13406 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm4
13407 ; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm25
13408 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
13409 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13410 ; AVX512-FCP-NEXT: vpbroadcastd 8(%rax), %ymm3
13411 ; AVX512-FCP-NEXT: vpandn %ymm3, %ymm6, %ymm3
13412 ; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm4
13413 ; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13414 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4
13415 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
13416 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13417 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
13418 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm3
13419 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm24
13420 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[0,0,2,1,4,4,6,5]
13421 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
13422 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,0,3,10,10,11,11]
13423 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm22, %zmm4
13424 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13425 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13426 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm3
13427 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13428 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[3,3,3,3,7,7,7,7]
13429 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8,9,10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15]
13430 ; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm7
13431 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm4
13432 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[2,2,2,2,6,6,6,6]
13433 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13,14,15]
13434 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm4
13435 ; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13436 ; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm2
13437 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm14, %ymm3
13438 ; AVX512-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13439 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,3,3,3,7,7,7,7]
13440 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14,15]
13441 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm5
13442 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[2,2,2,2,6,6,6,6]
13443 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
13444 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm0
13445 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm4 ^ (zmm11 & (zmm0 ^ zmm4))
13446 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13447 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
13448 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7]
13449 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
13450 ; AVX512-FCP-NEXT: vprold $16, %ymm9, %ymm4
13451 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[1,2,2,3,5,6,6,7]
13452 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
13453 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [2,1,3,2,10,10,10,11]
13454 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm4
13455 ; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm16
13456 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,21,0,0,0,22,0,0,14,0,0,0,15,0,0,15]
13457 ; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm0
13458 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13459 ; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm16, %zmm3
13460 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (zmm19 & (zmm3 ^ zmm4))
13461 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13462 ; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm10
13463 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm3
13464 ; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm19
13465 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm17[2,2,2,2,6,6,6,6]
13466 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
13467 ; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm9
13468 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm4
13469 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[1,1,1,1,5,5,5,5]
13470 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
13471 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm5
13472 ; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm8
13473 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm3
13474 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm21[2,2,2,2,6,6,6,6]
13475 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
13476 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
13477 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[0,1,1,3,4,5,5,7]
13478 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7,8,9],ymm4[10],ymm6[11,12],ymm4[13],ymm6[14,15]
13479 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm6
13480 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
13481 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm5 ^ (zmm27 & (zmm6 ^ zmm5))
13482 ; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm12
13483 ; AVX512-FCP-NEXT: vprold $16, %ymm26, %ymm3
13484 ; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm22
13485 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm28[1,2,2,3,5,6,6,7]
13486 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15]
13487 ; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm0
13488 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm5
13489 ; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm30
13490 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm28[0,0,2,1,4,4,6,5]
13491 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15]
13492 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [2,2,3,3,10,9,11,10]
13493 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm5
13494 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm25, %zmm1, %zmm1
13495 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0]
13496 ; AVX512-FCP-NEXT: vpermd %zmm1, %zmm0, %zmm0
13497 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (zmm18 & (zmm0 ^ zmm5))
13498 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm6))
13499 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13500 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm0
13501 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm21[3,3,3,3,7,7,7,7]
13502 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
13503 ; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm0
13504 ; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm2
13505 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm14 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
13506 ; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm3
13507 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,2,2]
13508 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7]
13509 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
13510 ; AVX512-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13511 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
13512 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
13513 ; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2
13514 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,1,3,8,8,9,9]
13515 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm2
13516 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm0
13517 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
13518 ; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm6
13519 ; AVX512-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13520 ; AVX512-FCP-NEXT: vprold $16, %xmm6, %xmm3
13521 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,2,3]
13522 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
13523 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
13524 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
13525 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm5
13526 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,1,1,8,8,10,9]
13527 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm28, %zmm5
13528 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
13529 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm3 & (zmm5 ^ zmm2))
13530 ; AVX512-FCP-NEXT: vmovdqa 64(%r9), %xmm2
13531 ; AVX512-FCP-NEXT: vmovdqa 64(%r8), %xmm6
13532 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
13533 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13534 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
13535 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
13536 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
13537 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm6
13538 ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2
13539 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,9,9,11]
13540 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm2
13541 ; AVX512-FCP-NEXT: vpbroadcastd 64(%rax), %ymm6
13542 ; AVX512-FCP-NEXT: vpbroadcastd 68(%rax), %ymm8
13543 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm31
13544 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
13545 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm31 = zmm31 ^ (zmm21 & (zmm31 ^ zmm2))
13546 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
13547 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm2 & (zmm31 ^ zmm5))
13548 ; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm0
13549 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13550 ; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm8
13551 ; AVX512-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13552 ; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm5
13553 ; AVX512-FCP-NEXT: vmovdqa64 %xmm14, %xmm29
13554 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[1,1,2,2]
13555 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
13556 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
13557 ; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6
13558 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm6
13559 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
13560 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13561 ; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm13
13562 ; AVX512-FCP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13563 ; AVX512-FCP-NEXT: vprold $16, %xmm13, %xmm5
13564 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,2,3]
13565 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
13566 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
13567 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm8
13568 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm28, %zmm8
13569 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm6 ^ (zmm3 & (zmm8 ^ zmm6))
13570 ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm3
13571 ; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm5
13572 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
13573 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13574 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
13575 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm5
13576 ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3
13577 ; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm17
13578 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm26, %zmm3
13579 ; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm5
13580 ; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6
13581 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm26
13582 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm26 = zmm26 ^ (zmm21 & (zmm26 ^ zmm3))
13583 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm2 & (zmm26 ^ zmm8))
13584 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
13585 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
13586 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
13587 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[2,2,2,2,6,6,6,6]
13588 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
13589 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm3
13590 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm6[1,1,1,1,5,5,5,5]
13591 ; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm9
13592 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15]
13593 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm3
13594 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13595 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
13596 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload
13597 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[2,2,2,2,6,6,6,6]
13598 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15]
13599 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
13600 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[0,1,1,3,4,5,5,7]
13601 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
13602 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm5
13603 ; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm11
13604 ; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm7
13605 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
13606 ; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2
13607 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,2,3,8,9,9,11]
13608 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm1
13609 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
13610 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm2
13611 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm19[3,3,3,3,7,7,7,7]
13612 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8,9,10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15]
13613 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm27 & (zmm5 ^ zmm3))
13614 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
13615 ; AVX512-FCP-NEXT: vprold $16, %ymm4, %ymm2
13616 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload
13617 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm19[1,2,2,3,5,6,6,7]
13618 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
13619 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
13620 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm24
13621 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm19[0,0,2,1,4,4,6,5]
13622 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
13623 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm8
13624 ; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm3
13625 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm4
13626 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
13627 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
13628 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,1,3,3,8,8,9,9]
13629 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm10
13630 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
13631 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm1 ^ (zmm21 & (zmm10 ^ zmm1))
13632 ; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm1
13633 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
13634 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7]
13635 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
13636 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload
13637 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm16, %zmm1
13638 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0]
13639 ; AVX512-FCP-NEXT: vpermd %zmm1, %zmm16, %zmm22
13640 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm22 = zmm22 ^ (mem & (zmm22 ^ zmm8))
13641 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (mem & (zmm22 ^ zmm5))
13642 ; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm5
13643 ; AVX512-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13644 ; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm1
13645 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13646 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
13647 ; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1
13648 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
13649 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[3,3,3,3,7,7,7,7]
13650 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6,7,8],ymm0[9],ymm5[10,11],ymm0[12],ymm5[13,14,15]
13651 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm0
13652 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [2,2,2,3,8,8,8,9]
13653 ; AVX512-FCP-NEXT: vmovdqa 96(%r9), %xmm6
13654 ; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm5
13655 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
13656 ; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm1
13657 ; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm1
13658 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm2
13659 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm14, %ymm1
13660 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,3,3,3,7,7,7,7]
13661 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7,8,9,10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15]
13662 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm15
13663 ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm9
13664 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
13665 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm14
13666 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm1
13667 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [6,7,3,3,7,7,6,7]
13668 ; AVX512-FCP-NEXT: vpermd %ymm25, %ymm17, %ymm16
13669 ; AVX512-FCP-NEXT: vpbroadcastd 96(%rax), %ymm20
13670 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm16, %zmm25
13671 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
13672 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm25 = zmm25 ^ (zmm18 & (zmm25 ^ zmm2))
13673 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
13674 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm27 & (zmm25 ^ zmm10))
13675 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
13676 ; AVX512-FCP-NEXT: vprold $16, %xmm3, %xmm3
13677 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
13678 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
13679 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm21 & (zmm1 ^ zmm0))
13680 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
13681 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,2,1,8,8,9,11]
13682 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm3
13683 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
13684 ; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm2
13685 ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm2
13686 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2]
13687 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm2[1],xmm7[2,3],xmm2[4],xmm7[5,6],xmm2[7]
13688 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
13689 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm28, %zmm7
13690 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
13691 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm3 ^ (zmm28 & (zmm7 ^ zmm3))
13692 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
13693 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
13694 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13695 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = ymm16 ^ (ymm18 & (ymm16 ^ ymm0))
13696 ; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm13
13697 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
13698 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
13699 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,1,3,8,8,9,9]
13700 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm6
13701 ; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm2
13702 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
13703 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm19[3,3,3,3,7,7,7,7]
13704 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
13705 ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm3
13706 ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm11
13707 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3]
13708 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
13709 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm30, %zmm8
13710 ; AVX512-FCP-NEXT: vpbroadcastd 100(%rax), %ymm5
13711 ; AVX512-FCP-NEXT: vpbroadcastd 104(%rax), %ymm19
13712 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm5, %zmm5
13713 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
13714 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = zmm5 ^ (zmm24 & (zmm5 ^ zmm6))
13715 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
13716 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
13717 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[3,3,3,3]
13718 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
13719 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
13720 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
13721 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm14
13722 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,1,1]
13723 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
13724 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm10
13725 ; AVX512-FCP-NEXT: vmovdqa64 %xmm12, %xmm30
13726 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
13727 ; AVX512-FCP-NEXT: vpermd %ymm23, %ymm17, %ymm17
13728 ; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm19
13729 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm17, %zmm20
13730 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm20 = zmm20 ^ (zmm18 & (zmm20 ^ zmm8))
13731 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm27 & (zmm20 ^ zmm1))
13732 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
13733 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm7))
13734 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
13735 ; AVX512-FCP-NEXT: vprold $16, %xmm9, %xmm8
13736 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[1,1,2,3]
13737 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7]
13738 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
13739 ; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7
13740 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm8
13741 ; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm0
13742 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
13743 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm7
13744 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13745 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
13746 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[1,1,2,2]
13747 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0],xmm7[1],xmm12[2,3],xmm7[4],xmm12[5,6],xmm7[7]
13748 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7]
13749 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,1,1,8,8,10,9]
13750 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm7
13751 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm8 ^ (zmm28 & (zmm7 ^ zmm8))
13752 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7]
13753 ; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm0
13754 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3
13755 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
13756 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9]
13757 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
13758 ; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm3
13759 ; AVX512-FCP-NEXT: vpbroadcastd 40(%rax), %ymm8
13760 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3
13761 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (zmm24 & (zmm3 ^ zmm2))
13762 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm1 & (zmm3 ^ zmm7))
13763 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
13764 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm13 = mem ^ (ymm0 & (ymm13 ^ mem))
13765 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
13766 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm1 & (ymm6 ^ ymm13))
13767 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
13768 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm2 & (ymm4 ^ ymm16))
13769 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0
13770 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm0[4,5,6,7]
13771 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
13772 ; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm0))
13773 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 32-byte Folded Reload
13774 ; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload
13775 ; AVX512-FCP-NEXT: # zmm0 = (zmm0 & zmm2) | mem
13776 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
13777 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm2 # 32-byte Folded Reload
13778 ; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
13779 ; AVX512-FCP-NEXT: # zmm2 = (zmm2 & zmm4) | mem
13780 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
13781 ; AVX512-FCP-NEXT: vpunpckhwd (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload
13782 ; AVX512-FCP-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
13783 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
13784 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
13785 ; AVX512-FCP-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
13786 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
13787 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
13788 ; AVX512-FCP-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7]
13789 ; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm4
13790 ; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7
13791 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
13792 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
13793 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
13794 ; AVX512-FCP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
13795 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15]
13796 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
13797 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
13798 ; AVX512-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
13799 ; AVX512-FCP-NEXT: # ymm10 = mem[1,1,1,1,5,5,5,5]
13800 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
13801 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,1,4,5,4,5,5,7]
13802 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
13803 ; AVX512-FCP-NEXT: vpermd %ymm11, %ymm10, %ymm10
13804 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17],zero,zero
13805 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
13806 ; AVX512-FCP-NEXT: vpandn %ymm10, %ymm12, %ymm10
13807 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
13808 ; AVX512-FCP-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
13809 ; AVX512-FCP-NEXT: # xmm11 = mem[0,2,3,3,4,5,6,7]
13810 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
13811 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3]
13812 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7]
13813 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
13814 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,3]
13815 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2]
13816 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
13817 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
13818 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
13819 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm12))
13820 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload
13821 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
13822 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm11 ^ (zmm28 & (zmm4 ^ zmm11))
13823 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
13824 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 32-byte Folded Reload
13825 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm6 ^ (zmm28 & (zmm7 ^ zmm6))
13826 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
13827 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm0 ^ (zmm6 & (zmm4 ^ zmm0))
13828 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm2 ^ (zmm6 & (zmm7 ^ zmm2))
13829 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13830 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
13831 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
13832 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2
13833 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 ^ (mem & (zmm2 ^ zmm0))
13834 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13835 ; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
13836 ; AVX512-FCP-NEXT: # zmm0 = zmm0 | (zmm1 & mem)
13837 ; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm10 # 64-byte Folded Reload
13838 ; AVX512-FCP-NEXT: # zmm10 = zmm10 | (zmm1 & mem)
13839 ; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm2))
13840 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
13841 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 320(%rax)
13842 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 256(%rax)
13843 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax)
13844 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 128(%rax)
13845 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, (%rax)
13846 ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 448(%rax)
13847 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 704(%rax)
13848 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 640(%rax)
13849 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
13850 ; AVX512-FCP-NEXT: vmovaps %zmm1, 576(%rax)
13851 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax)
13852 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%rax)
13853 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 512(%rax)
13854 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 832(%rax)
13855 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 768(%rax)
13856 ; AVX512-FCP-NEXT: addq $1496, %rsp # imm = 0x5D8
13857 ; AVX512-FCP-NEXT: vzeroupper
13858 ; AVX512-FCP-NEXT: retq
13860 ; AVX512DQ-LABEL: store_i16_stride7_vf64:
13861 ; AVX512DQ: # %bb.0:
13862 ; AVX512DQ-NEXT: subq $2808, %rsp # imm = 0xAF8
13863 ; AVX512DQ-NEXT: vmovdqa 96(%rcx), %ymm6
13864 ; AVX512DQ-NEXT: vmovdqa 96(%rdx), %ymm13
13865 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm7
13866 ; AVX512DQ-NEXT: vmovdqa 96(%rsi), %ymm8
13867 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
13868 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm6, %ymm2
13869 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
13870 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm13, %ymm3
13871 ; AVX512DQ-NEXT: vporq %ymm2, %ymm3, %ymm17
13872 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
13873 ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm8, %ymm2
13874 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
13875 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm7, %ymm3
13876 ; AVX512DQ-NEXT: vporq %ymm2, %ymm3, %ymm18
13877 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
13878 ; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm2
13879 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13880 ; AVX512DQ-NEXT: vpshufb %ymm12, %ymm2, %ymm2
13881 ; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm3
13882 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13883 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
13884 ; AVX512DQ-NEXT: vpshufb %ymm9, %ymm3, %ymm3
13885 ; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
13886 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13887 ; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm3
13888 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm3, %ymm2
13889 ; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm26
13890 ; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm4
13891 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm3
13892 ; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm27
13893 ; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
13894 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13895 ; AVX512DQ-NEXT: vmovdqa 64(%rsi), %ymm3
13896 ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm3, %ymm2
13897 ; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm23
13898 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4
13899 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm3
13900 ; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm22
13901 ; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
13902 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13903 ; AVX512DQ-NEXT: vmovdqa (%r9), %ymm2
13904 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13905 ; AVX512DQ-NEXT: vpshufb %ymm12, %ymm2, %ymm2
13906 ; AVX512DQ-NEXT: vmovdqa (%r8), %ymm3
13907 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13908 ; AVX512DQ-NEXT: vpshufb %ymm9, %ymm3, %ymm3
13909 ; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
13910 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13911 ; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm2
13912 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13913 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2
13914 ; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm3
13915 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13916 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm3
13917 ; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
13918 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13919 ; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm2
13920 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13921 ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm2, %ymm2
13922 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3
13923 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13924 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3
13925 ; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
13926 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13927 ; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm4
13928 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0
13929 ; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm2
13930 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1
13931 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
13932 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13933 ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm5
13934 ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm5, %ymm0
13935 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm3
13936 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm1
13937 ; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
13938 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13939 ; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm1
13940 ; AVX512DQ-NEXT: vpshufb %ymm9, %ymm1, %ymm9
13941 ; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm0
13942 ; AVX512DQ-NEXT: vpshufb %ymm12, %ymm0, %ymm10
13943 ; AVX512DQ-NEXT: vpor %ymm10, %ymm9, %ymm9
13944 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13945 ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
13946 ; AVX512DQ-NEXT: # ymm10 = mem[0,1,0,1]
13947 ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm6, %ymm9
13948 ; AVX512DQ-NEXT: vmovdqa64 %ymm10, %ymm31
13949 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[2,2,2,2,6,6,6,6]
13950 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
13951 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
13952 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13953 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
13954 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
13955 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[3,3,3,3,7,7,7,7]
13956 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15]
13957 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
13958 ; AVX512DQ-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13959 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
13960 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
13961 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[2,2,2,2,6,6,6,6]
13962 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15]
13963 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
13964 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13965 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
13966 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
13967 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[3,3,3,3,7,7,7,7]
13968 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15]
13969 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3]
13970 ; AVX512DQ-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13971 ; AVX512DQ-NEXT: vmovdqa 96(%r9), %ymm9
13972 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
13973 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3]
13974 ; AVX512DQ-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13975 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[2,1,2,3,6,5,6,7]
13976 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
13977 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm10[2,2,2,2]
13978 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[0,1,1,3,4,5,5,7]
13979 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
13980 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4]
13981 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7,8,9],ymm6[10],ymm10[11,12],ymm6[13],ymm10[14,15]
13982 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2]
13983 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5]
13984 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
13985 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5]
13986 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15]
13987 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
13988 ; AVX512DQ-NEXT: vprold $16, %ymm9, %ymm8
13989 ; AVX512DQ-NEXT: vpshufb %ymm12, %ymm9, %ymm9
13990 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm17, %zmm6
13991 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm7
13992 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm6 ^ (mem & (zmm7 ^ zmm6))
13993 ; AVX512DQ-NEXT: vmovdqa 96(%r8), %ymm6
13994 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[1,2,2,3,5,6,6,7]
13995 ; AVX512DQ-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13996 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,0,2,1,4,4,6,5]
13997 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero
13998 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
13999 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm7 & ymm11)
14000 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm12
14001 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
14002 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm6 & ymm11)
14003 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm7, %ymm6
14004 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm10[2,1,3,3]
14005 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
14006 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm10 & (ymm7 ^ ymm6))
14007 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm8[2,2,2,2]
14008 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
14009 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm8 & (ymm6 ^ ymm7))
14010 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm6
14011 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[4,5,6,7]
14012 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14013 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
14014 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
14015 ; AVX512DQ-NEXT: vmovdqa 96(%rax), %ymm6
14016 ; AVX512DQ-NEXT: vpermd %zmm6, %zmm18, %zmm7
14017 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14018 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7]
14019 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
14020 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm6, %ymm6
14021 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
14022 ; AVX512DQ-NEXT: vpandn %ymm7, %ymm12, %ymm7
14023 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm19
14024 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
14025 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14026 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
14027 ; AVX512DQ-NEXT: vpbroadcastd 72(%rax), %ymm6
14028 ; AVX512DQ-NEXT: vpandnq %ymm6, %ymm28, %ymm6
14029 ; AVX512DQ-NEXT: vmovdqa 64(%rax), %ymm7
14030 ; AVX512DQ-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14031 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm7, %ymm7
14032 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
14033 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14034 ; AVX512DQ-NEXT: vmovdqa 64(%r9), %xmm7
14035 ; AVX512DQ-NEXT: vmovdqa 64(%r8), %xmm8
14036 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
14037 ; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm17
14038 ; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm20
14039 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
14040 ; AVX512DQ-NEXT: vpshufb %xmm15, %xmm6, %xmm6
14041 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
14042 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14043 ; AVX512DQ-NEXT: vmovdqa 64(%rcx), %xmm9
14044 ; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm7
14045 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
14046 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
14047 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
14048 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14049 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm14
14050 ; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm10
14051 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
14052 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
14053 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
14054 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3]
14055 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14056 ; AVX512DQ-NEXT: vpbroadcastd 8(%rax), %ymm8
14057 ; AVX512DQ-NEXT: vpandnq %ymm8, %ymm28, %ymm8
14058 ; AVX512DQ-NEXT: vmovdqa (%rax), %ymm12
14059 ; AVX512DQ-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14060 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm12, %ymm13
14061 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8
14062 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14063 ; AVX512DQ-NEXT: vmovdqa (%r9), %xmm6
14064 ; AVX512DQ-NEXT: vmovdqa (%r8), %xmm12
14065 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
14066 ; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm29
14067 ; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm24
14068 ; AVX512DQ-NEXT: vpshufb %xmm15, %xmm8, %xmm8
14069 ; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm25
14070 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
14071 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14072 ; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm8
14073 ; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm13
14074 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
14075 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,3,3,4,5,6,7]
14076 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm15[0,0,2,1]
14077 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14078 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm6
14079 ; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm12
14080 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
14081 ; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm21
14082 ; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm16
14083 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,1,2,3,4,5,6,7]
14084 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4]
14085 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm15[0,0,1,3]
14086 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14087 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
14088 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,0,0,4,4,4,4]
14089 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[0,1,1,3,4,5,5,7]
14090 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
14091 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2]
14092 ; AVX512DQ-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14093 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
14094 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5]
14095 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5]
14096 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
14097 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
14098 ; AVX512DQ-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14099 ; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm15
14100 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm15, %ymm11
14101 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm15[0,1,1,3,4,5,5,7]
14102 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3]
14103 ; AVX512DQ-NEXT: vpandnq %ymm12, %ymm19, %ymm12
14104 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11
14105 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14106 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
14107 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,0,0,4,4,4,4]
14108 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,0,2,1,4,4,6,5]
14109 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8,9,10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
14110 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3]
14111 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14112 ; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm6
14113 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm4, %ymm11
14114 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6]
14115 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15]
14116 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
14117 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14118 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
14119 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
14120 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
14121 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15]
14122 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
14123 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14124 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
14125 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
14126 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6]
14127 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15]
14128 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
14129 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14130 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7]
14131 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
14132 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
14133 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
14134 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
14135 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14136 ; AVX512DQ-NEXT: vprold $16, %ymm0, %ymm2
14137 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,2,2,3,5,6,6,7]
14138 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
14139 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2]
14140 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14141 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7]
14142 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
14143 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,3,6,6,6,7]
14144 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
14145 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
14146 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14147 ; AVX512DQ-NEXT: vpermd %zmm15, %zmm18, %zmm0
14148 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14149 ; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm12
14150 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
14151 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
14152 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[1,1,1,1,5,5,5,5]
14153 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
14154 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
14155 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14156 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
14157 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
14158 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[2,2,2,2,6,6,6,6]
14159 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
14160 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
14161 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14162 ; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm2
14163 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
14164 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
14165 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[0,1,1,3,4,5,5,7]
14166 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
14167 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
14168 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14169 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm2, %ymm0
14170 ; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm3
14171 ; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm11
14172 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[2,2,2,2,6,6,6,6]
14173 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
14174 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
14175 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14176 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
14177 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
14178 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
14179 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
14180 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,0,2,1,4,4,6,5]
14181 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
14182 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
14183 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14184 ; AVX512DQ-NEXT: vprold $16, %ymm5, %ymm0
14185 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[1,2,2,3,5,6,6,7]
14186 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
14187 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
14188 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14189 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm27[3,3,3,3,7,7,7,7]
14190 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
14191 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
14192 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15]
14193 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
14194 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14195 ; AVX512DQ-NEXT: vmovdqa 96(%rcx), %xmm0
14196 ; AVX512DQ-NEXT: vmovdqa 96(%rdx), %xmm1
14197 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
14198 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7]
14199 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
14200 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14201 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7]
14202 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
14203 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
14204 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
14205 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
14206 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14207 ; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm2
14208 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm3
14209 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
14210 ; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm12
14211 ; AVX512DQ-NEXT: vpshufb %xmm12, %xmm4, %xmm4
14212 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
14213 ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14214 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[3,3,3,3,7,7,7,7]
14215 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
14216 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,3,6,6,6,7]
14217 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
14218 ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14219 ; AVX512DQ-NEXT: vprold $16, %xmm2, %xmm4
14220 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,3]
14221 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
14222 ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14223 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
14224 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14225 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm6 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
14226 ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm0, %xmm2
14227 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2]
14228 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
14229 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14230 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
14231 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14232 ; AVX512DQ-NEXT: vmovdqa 96(%r9), %xmm0
14233 ; AVX512DQ-NEXT: vmovdqa 96(%r8), %xmm1
14234 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
14235 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14236 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
14237 ; AVX512DQ-NEXT: vpshufb %xmm12, %xmm0, %xmm0
14238 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
14239 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,7,6]
14240 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm18, %zmm0
14241 ; AVX512DQ-NEXT: vpbroadcastd 100(%rax), %ymm1
14242 ; AVX512DQ-NEXT: vpbroadcastd 104(%rax), %ymm2
14243 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm31
14244 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm31 = zmm31 ^ (mem & (zmm31 ^ zmm0))
14245 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
14246 ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm9, %xmm1
14247 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2]
14248 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
14249 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
14250 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3]
14251 ; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm1
14252 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
14253 ; AVX512DQ-NEXT: vpshufb %xmm12, %xmm0, %xmm0
14254 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
14255 ; AVX512DQ-NEXT: vprold $16, %xmm10, %xmm3
14256 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[1,1,2,3]
14257 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
14258 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
14259 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14
14260 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
14261 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm1 ^ (zmm0 & (zmm14 ^ zmm1))
14262 ; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm1
14263 ; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm3
14264 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
14265 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,7,6]
14266 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
14267 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
14268 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm4, %zmm1
14269 ; AVX512DQ-NEXT: vpbroadcastd 64(%rax), %ymm3
14270 ; AVX512DQ-NEXT: vpbroadcastd 68(%rax), %ymm5
14271 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm25
14272 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
14273 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm25 = zmm25 ^ (zmm19 & (zmm25 ^ zmm1))
14274 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
14275 ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm8, %xmm3
14276 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,2]
14277 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7]
14278 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
14279 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm3
14280 ; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm5
14281 ; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm2
14282 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
14283 ; AVX512DQ-NEXT: vpshufb %xmm12, %xmm1, %xmm1
14284 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
14285 ; AVX512DQ-NEXT: vprold $16, %xmm21, %xmm2
14286 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm16[1,1,2,3]
14287 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7]
14288 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
14289 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
14290 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm0 & (zmm5 ^ zmm3))
14291 ; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm0
14292 ; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm1
14293 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
14294 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6]
14295 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
14296 ; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm4, %zmm0
14297 ; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm1
14298 ; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm2
14299 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm20
14300 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm20 = zmm20 ^ (zmm19 & (zmm20 ^ zmm0))
14301 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
14302 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
14303 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
14304 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
14305 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5]
14306 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
14307 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
14308 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14309 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
14310 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
14311 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,2,2,2,6,6,6,6]
14312 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
14313 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
14314 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14315 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
14316 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
14317 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
14318 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
14319 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7]
14320 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
14321 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
14322 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14323 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm9, %ymm0
14324 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6]
14325 ; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm11
14326 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
14327 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
14328 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
14329 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
14330 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
14331 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,0,2,1,4,4,6,5]
14332 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
14333 ; AVX512DQ-NEXT: vprold $16, %ymm7, %ymm0
14334 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[1,2,2,3,5,6,6,7]
14335 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15]
14336 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7]
14337 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
14338 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
14339 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15]
14340 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7]
14341 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
14342 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6]
14343 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7,8,9,10],ymm6[11],ymm0[12,13],ymm6[14],ymm0[15]
14344 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[3,3,3,3,7,7,7,7]
14345 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
14346 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7]
14347 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
14348 ; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14349 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm22 = ymm1[2,2,2,3]
14350 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm26 = ymm3[2,2,3,3]
14351 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm24 = ymm4[2,1,3,2]
14352 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm9
14353 ; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm10
14354 ; AVX512DQ-NEXT: vprold $16, %xmm10, %xmm1
14355 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3]
14356 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
14357 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14358 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm21 = ymm2[0,2,2,3]
14359 ; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm1
14360 ; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm2
14361 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
14362 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
14363 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
14364 ; AVX512DQ-NEXT: vpshufb %xmm12, %xmm2, %xmm3
14365 ; AVX512DQ-NEXT: vpshufb %xmm12, %xmm1, %xmm1
14366 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,7,6]
14367 ; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm27
14368 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm18, %zmm1
14369 ; AVX512DQ-NEXT: vpbroadcastd 36(%rax), %ymm2
14370 ; AVX512DQ-NEXT: vpbroadcastd 40(%rax), %ymm4
14371 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm13
14372 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm1))
14373 ; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm7
14374 ; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm6
14375 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
14376 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7]
14377 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,0,1,1]
14378 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm0[2,1,3,3]
14379 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,0,1,1]
14380 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14381 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
14382 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14383 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload
14384 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
14385 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm4 & (zmm1 ^ zmm3))
14386 ; AVX512DQ-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
14387 ; AVX512DQ-NEXT: # ymm3 = mem[2,1,3,2]
14388 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
14389 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm29 & (ymm3 ^ ymm1))
14390 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm30 = ymm30 ^ (ymm28 & (ymm30 ^ ymm3))
14391 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14392 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
14393 ; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm3 # 64-byte Folded Reload
14394 ; AVX512DQ-NEXT: # zmm3 = (zmm3 & zmm28) | mem
14395 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14396 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
14397 ; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm0 # 64-byte Folded Reload
14398 ; AVX512DQ-NEXT: # zmm0 = (zmm0 & zmm28) | mem
14399 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14400 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 32-byte Folded Reload
14401 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14402 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm18 # 32-byte Folded Reload
14403 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
14404 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm23 ^ (zmm28 & (zmm18 ^ zmm23))
14405 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
14406 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm3 ^ (zmm2 & (zmm18 ^ zmm3))
14407 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
14408 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
14409 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
14410 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm23 # 32-byte Folded Reload
14411 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = zmm3 ^ (zmm28 & (zmm23 ^ zmm3))
14412 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = zmm0 ^ (zmm2 & (zmm23 ^ zmm0))
14413 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14414 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
14415 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14416 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 32-byte Folded Reload
14417 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm0 ^ (mem & (zmm17 ^ zmm0))
14418 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0
14419 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm0 = mem ^ (ymm19 & (ymm0 ^ mem))
14420 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
14421 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14422 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm1 & (ymm2 ^ ymm0))
14423 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
14424 ; AVX512DQ-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm19 # 64-byte Folded Reload
14425 ; AVX512DQ-NEXT: # zmm19 = zmm19 | (zmm1 & mem)
14426 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
14427 ; AVX512DQ-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
14428 ; AVX512DQ-NEXT: # zmm0 = zmm0[0,1,2,3],mem[0,1,2,3]
14429 ; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
14430 ; AVX512DQ-NEXT: # zmm0 = (zmm0 & zmm1) | mem
14431 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1
14432 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm30[0,1,2,3],zmm1[4,5,6,7]
14433 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
14434 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
14435 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm30 = zmm30 ^ (zmm2 & (zmm30 ^ zmm1))
14436 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
14437 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
14438 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
14439 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm11 = zmm11 ^ (zmm2 & (zmm11 ^ zmm1))
14440 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
14441 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm1 & (zmm25 ^ zmm14))
14442 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm1 & (zmm20 ^ zmm5))
14443 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
14444 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
14445 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14446 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
14447 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
14448 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm5 & (zmm2 ^ zmm1))
14449 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
14450 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
14451 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
14452 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm22, %zmm3, %zmm22
14453 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm1 ^ (zmm5 & (zmm22 ^ zmm1))
14454 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
14455 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
14456 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
14457 ; AVX512DQ-NEXT: vpermd 64(%rax), %zmm14, %zmm5
14458 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
14459 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm5 = zmm5 ^ (zmm3 & (zmm5 ^ zmm1))
14460 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm24, %zmm26, %zmm1
14461 ; AVX512DQ-NEXT: vpermd (%rax), %zmm14, %zmm14
14462 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm14 = zmm14 ^ (zmm3 & (zmm14 ^ zmm1))
14463 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
14464 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm2))
14465 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm1 & (zmm14 ^ zmm22))
14466 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
14467 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
14468 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14469 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
14470 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm4 & (zmm2 ^ zmm1))
14471 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
14472 ; AVX512DQ-NEXT: # ymm1 = mem[0,1,1,3]
14473 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
14474 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
14475 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
14476 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 # 32-byte Folded Reload
14477 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm1 ^ (zmm4 & (zmm22 ^ zmm1))
14478 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm12[0,1,1,3]
14479 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm21, %zmm1
14480 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm15, %zmm8
14481 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm1 ^ (zmm4 & (zmm8 ^ zmm1))
14482 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
14483 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
14484 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2]
14485 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm4[1],xmm9[2,3],xmm4[4],xmm9[5,6],xmm4[7]
14486 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
14487 ; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
14488 ; AVX512DQ-NEXT: # ymm7 = mem[2,2,2,3]
14489 ; AVX512DQ-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
14490 ; AVX512DQ-NEXT: # xmm9 = mem[0,1,3,2,4,5,6,7]
14491 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,3]
14492 ; AVX512DQ-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
14493 ; AVX512DQ-NEXT: # ymm10 = mem[2,3,3,3,6,7,7,7]
14494 ; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
14495 ; AVX512DQ-NEXT: # ymm16 = mem[0,0,2,1]
14496 ; AVX512DQ-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
14497 ; AVX512DQ-NEXT: # xmm12 = mem[2,1,2,3,4,5,6,7]
14498 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
14499 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3]
14500 ; AVX512DQ-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
14501 ; AVX512DQ-NEXT: # ymm21 = mem[0,0,1,1]
14502 ; AVX512DQ-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
14503 ; AVX512DQ-NEXT: # xmm15 = mem[0,2,3,3,4,5,6,7]
14504 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1]
14505 ; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
14506 ; AVX512DQ-NEXT: # ymm24 = mem[2,2,2,3]
14507 ; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm3
14508 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7]
14509 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,3]
14510 ; AVX512DQ-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
14511 ; AVX512DQ-NEXT: # ymm26 = mem[2,3,3,3,6,7,7,7]
14512 ; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
14513 ; AVX512DQ-NEXT: # ymm27 = mem[0,0,2,1]
14514 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
14515 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4]
14516 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
14517 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
14518 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7]
14519 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
14520 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
14521 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7
14522 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm10[2,1,3,2]
14523 ; AVX512DQ-NEXT: vpbroadcastd 96(%rax), %ymm10
14524 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9
14525 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm9 = zmm9 ^ (zmm29 & (zmm9 ^ zmm7))
14526 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
14527 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm24, %zmm3
14528 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm26[2,1,3,2]
14529 ; AVX512DQ-NEXT: vpbroadcastd 32(%rax), %ymm10
14530 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7
14531 ; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (zmm29 & (zmm7 ^ zmm3))
14532 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
14533 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm3 & (zmm9 ^ zmm22))
14534 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm3 & (zmm7 ^ zmm8))
14535 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm16, %zmm3
14536 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm8
14537 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm3 ^ (zmm28 & (zmm8 ^ zmm3))
14538 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm27, %zmm1
14539 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm3
14540 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm1 ^ (zmm28 & (zmm3 ^ zmm1))
14541 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
14542 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm1 & (zmm31 ^ zmm8))
14543 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm1 & (zmm13 ^ zmm3))
14544 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm17))
14545 ; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm2))
14546 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
14547 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%rax)
14548 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 256(%rax)
14549 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%rax)
14550 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 128(%rax)
14551 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, 64(%rax)
14552 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rax)
14553 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, 448(%rax)
14554 ; AVX512DQ-NEXT: vmovdqa64 %zmm31, 704(%rax)
14555 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 640(%rax)
14556 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 576(%rax)
14557 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 512(%rax)
14558 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, 384(%rax)
14559 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, 768(%rax)
14560 ; AVX512DQ-NEXT: vmovdqa64 %zmm30, 832(%rax)
14561 ; AVX512DQ-NEXT: addq $2808, %rsp # imm = 0xAF8
14562 ; AVX512DQ-NEXT: vzeroupper
14563 ; AVX512DQ-NEXT: retq
14565 ; AVX512DQ-FCP-LABEL: store_i16_stride7_vf64:
14566 ; AVX512DQ-FCP: # %bb.0:
14567 ; AVX512DQ-FCP-NEXT: subq $1496, %rsp # imm = 0x5D8
14568 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm3
14569 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
14570 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm0
14571 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm1
14572 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
14573 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm2
14574 ; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm2, %ymm16
14575 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm4
14576 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
14577 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm0
14578 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm5
14579 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
14580 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm2
14581 ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm2, %ymm7
14582 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
14583 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm2
14584 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm0
14585 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm26
14586 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm12
14587 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
14588 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm2
14589 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm28
14590 ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0
14591 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14592 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm0
14593 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14
14594 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm31
14595 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm0
14596 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm15
14597 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
14598 ; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm12
14599 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14600 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm0
14601 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm14
14602 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm29
14603 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm0
14604 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm15
14605 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm17
14606 ; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm12
14607 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14608 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm0
14609 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14610 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm14
14611 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm0
14612 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14613 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm15
14614 ; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm12
14615 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14616 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm0
14617 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14618 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14
14619 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm0
14620 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14621 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm12
14622 ; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm12, %ymm12
14623 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14624 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm0
14625 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14626 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm12
14627 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0
14628 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14629 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm14
14630 ; AVX512DQ-FCP-NEXT: vpor %ymm12, %ymm14, %ymm12
14631 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14632 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm14
14633 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm14, %ymm8
14634 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm15
14635 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm15, %ymm9
14636 ; AVX512DQ-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8
14637 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14638 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm0
14639 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm8
14640 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
14641 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm9
14642 ; AVX512DQ-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8
14643 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14644 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm8
14645 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm8, %ymm10
14646 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm9
14647 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm11
14648 ; AVX512DQ-FCP-NEXT: vporq %ymm11, %ymm10, %ymm22
14649 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm3[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
14650 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,1,3,4,5,5,7]
14651 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15]
14652 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
14653 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
14654 ; AVX512DQ-FCP-NEXT: # ymm13 = mem[0,1,0,1]
14655 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm12
14656 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm30
14657 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm5[1,1,1,1,5,5,5,5]
14658 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7,8,9],ymm13[10],ymm12[11,12],ymm13[13],ymm12[14,15]
14659 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
14660 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm16, %zmm11
14661 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm12
14662 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm11 ^ (mem & (zmm12 ^ zmm11))
14663 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
14664 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm7
14665 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero
14666 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm12 & ymm13)
14667 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm10
14668 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm6
14669 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14670 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
14671 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm11 & ymm16)
14672 ; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm11
14673 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,4,0,0,0,5,0,0]
14674 ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm12
14675 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
14676 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm11))
14677 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm19
14678 ; AVX512DQ-FCP-NEXT: vprold $16, %ymm10, %ymm11
14679 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
14680 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
14681 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 ^ (ymm10 & (ymm11 ^ ymm12))
14682 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11
14683 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm11[4,5,6,7]
14684 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14685 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
14686 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,1,4,5,4,5,5,7]
14687 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rax), %ymm6
14688 ; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm11, %ymm11
14689 ; AVX512DQ-FCP-NEXT: vpandn %ymm11, %ymm13, %ymm11
14690 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm18
14691 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
14692 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm12
14693 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm12, %zmm11
14694 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14695 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
14696 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm11
14697 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm5[3,3,3,3,7,7,7,7]
14698 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8,9,10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15]
14699 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm12 = [151522058,0,421010202,421010202]
14700 ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm4
14701 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm23
14702 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6]
14703 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
14704 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,2,2,3,10,9,11,11]
14705 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm4
14706 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm12 = [218894094,0,488382238,488382238]
14707 ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm11
14708 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm16
14709 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,3,3,3,7,7,7,7]
14710 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15]
14711 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
14712 ; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,0,1]
14713 ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3
14714 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
14715 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15]
14716 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11]
14717 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm20, %zmm1
14718 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
14719 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm4 ^ (zmm11 & (zmm1 ^ zmm4))
14720 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14721 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [5,0,0,0,6,0,0,6]
14722 ; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm1
14723 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14724 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm1
14725 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,5,0,0,0,6,0,0,30,0,0,0,31,0,0,31]
14726 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm6, %zmm3
14727 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14728 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
14729 ; AVX512DQ-FCP-NEXT: vpbroadcastd 72(%rax), %ymm3
14730 ; AVX512DQ-FCP-NEXT: vpandn %ymm3, %ymm6, %ymm3
14731 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rax), %ymm7
14732 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm4
14733 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm25
14734 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
14735 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14736 ; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rax), %ymm3
14737 ; AVX512DQ-FCP-NEXT: vpandn %ymm3, %ymm6, %ymm3
14738 ; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm4
14739 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14740 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4
14741 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
14742 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14743 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
14744 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm3
14745 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm24
14746 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[0,0,2,1,4,4,6,5]
14747 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
14748 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,0,3,10,10,11,11]
14749 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm22, %zmm4
14750 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14751 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14752 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm3
14753 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14754 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[3,3,3,3,7,7,7,7]
14755 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8,9,10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15]
14756 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm7
14757 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm4
14758 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[2,2,2,2,6,6,6,6]
14759 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7,8],ymm4[9],ymm6[10,11],ymm4[12],ymm6[13,14,15]
14760 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm4
14761 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14762 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm2
14763 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm14, %ymm3
14764 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14765 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,3,3,3,7,7,7,7]
14766 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14,15]
14767 ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm5
14768 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[2,2,2,2,6,6,6,6]
14769 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
14770 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm0
14771 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm4 ^ (zmm11 & (zmm0 ^ zmm4))
14772 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14773 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
14774 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7]
14775 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
14776 ; AVX512DQ-FCP-NEXT: vprold $16, %ymm9, %ymm4
14777 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[1,2,2,3,5,6,6,7]
14778 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
14779 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [2,1,3,2,10,10,10,11]
14780 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm4
14781 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm16
14782 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,21,0,0,0,22,0,0,14,0,0,0,15,0,0,15]
14783 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm0
14784 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14785 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm16, %zmm3
14786 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (zmm19 & (zmm3 ^ zmm4))
14787 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14788 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm10
14789 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm3
14790 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm19
14791 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm17[2,2,2,2,6,6,6,6]
14792 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
14793 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm9
14794 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm4
14795 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[1,1,1,1,5,5,5,5]
14796 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
14797 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm5
14798 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm8
14799 ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm3
14800 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm21[2,2,2,2,6,6,6,6]
14801 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
14802 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
14803 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[0,1,1,3,4,5,5,7]
14804 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7,8,9],ymm4[10],ymm6[11,12],ymm4[13],ymm6[14,15]
14805 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm11, %zmm6
14806 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
14807 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm5 ^ (zmm27 & (zmm6 ^ zmm5))
14808 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm12
14809 ; AVX512DQ-FCP-NEXT: vprold $16, %ymm26, %ymm3
14810 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm22
14811 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm28[1,2,2,3,5,6,6,7]
14812 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7,8,9],ymm3[10],ymm5[11,12],ymm3[13],ymm5[14,15]
14813 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm0
14814 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm5
14815 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm30
14816 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm28[0,0,2,1,4,4,6,5]
14817 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15]
14818 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [2,2,3,3,10,9,11,10]
14819 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm5
14820 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm25, %zmm1, %zmm1
14821 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0]
14822 ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm0, %zmm0
14823 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 ^ (zmm18 & (zmm0 ^ zmm5))
14824 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm6))
14825 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14826 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm0
14827 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm21[3,3,3,3,7,7,7,7]
14828 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
14829 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm0
14830 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm2
14831 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm14 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
14832 ; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm3
14833 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,2,2]
14834 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7]
14835 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
14836 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14837 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
14838 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
14839 ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2
14840 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [0,1,1,3,8,8,9,9]
14841 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm2
14842 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm0
14843 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
14844 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm6
14845 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14846 ; AVX512DQ-FCP-NEXT: vprold $16, %xmm6, %xmm3
14847 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,2,3]
14848 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
14849 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
14850 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
14851 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm5
14852 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,1,1,8,8,10,9]
14853 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm28, %zmm5
14854 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
14855 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm3 & (zmm5 ^ zmm2))
14856 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %xmm2
14857 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %xmm6
14858 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
14859 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14860 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
14861 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
14862 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
14863 ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm6
14864 ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2
14865 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,9,9,11]
14866 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm2
14867 ; AVX512DQ-FCP-NEXT: vpbroadcastd 64(%rax), %ymm6
14868 ; AVX512DQ-FCP-NEXT: vpbroadcastd 68(%rax), %ymm8
14869 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm31
14870 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
14871 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm31 = zmm31 ^ (zmm21 & (zmm31 ^ zmm2))
14872 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
14873 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm2 & (zmm31 ^ zmm5))
14874 ; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm0
14875 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14876 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm8
14877 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14878 ; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm5
14879 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm14, %xmm29
14880 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[1,1,2,2]
14881 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
14882 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
14883 ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6
14884 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm13, %zmm6
14885 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
14886 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14887 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm13
14888 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14889 ; AVX512DQ-FCP-NEXT: vprold $16, %xmm13, %xmm5
14890 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,2,3]
14891 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
14892 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
14893 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm8
14894 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm28, %zmm8
14895 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm6 ^ (zmm3 & (zmm8 ^ zmm6))
14896 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm3
14897 ; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm5
14898 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
14899 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14900 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
14901 ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm5
14902 ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3
14903 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm17
14904 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm26, %zmm3
14905 ; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm5
14906 ; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6
14907 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm26
14908 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm26 = zmm26 ^ (zmm21 & (zmm26 ^ zmm3))
14909 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm2 & (zmm26 ^ zmm8))
14910 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
14911 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm14[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
14912 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
14913 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[2,2,2,2,6,6,6,6]
14914 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
14915 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm3
14916 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm6[1,1,1,1,5,5,5,5]
14917 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm9
14918 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7,8,9],ymm5[10],ymm3[11,12],ymm5[13],ymm3[14,15]
14919 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm3
14920 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14921 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
14922 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload
14923 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[2,2,2,2,6,6,6,6]
14924 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15]
14925 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
14926 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[0,1,1,3,4,5,5,7]
14927 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
14928 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm5
14929 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm11
14930 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm7
14931 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
14932 ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2
14933 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,2,3,8,9,9,11]
14934 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm1
14935 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
14936 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm2
14937 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm19[3,3,3,3,7,7,7,7]
14938 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1,2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8,9,10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15]
14939 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm27 & (zmm5 ^ zmm3))
14940 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
14941 ; AVX512DQ-FCP-NEXT: vprold $16, %ymm4, %ymm2
14942 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload
14943 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm19[1,2,2,3,5,6,6,7]
14944 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
14945 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
14946 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm24
14947 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm19[0,0,2,1,4,4,6,5]
14948 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
14949 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm8
14950 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm3
14951 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm4
14952 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
14953 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
14954 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,1,3,3,8,8,9,9]
14955 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm10
14956 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
14957 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm1 ^ (zmm21 & (zmm10 ^ zmm1))
14958 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm1
14959 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
14960 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7]
14961 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
14962 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Reload
14963 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm16, %zmm1
14964 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,4,0,0,0,5,0,0,13,0,0,0,14,0,0]
14965 ; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm16, %zmm22
14966 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm22 = zmm22 ^ (mem & (zmm22 ^ zmm8))
14967 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (mem & (zmm22 ^ zmm5))
14968 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm5
14969 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14970 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm1
14971 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14972 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
14973 ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1
14974 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
14975 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[3,3,3,3,7,7,7,7]
14976 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6,7,8],ymm0[9],ymm5[10,11],ymm0[12],ymm5[13,14,15]
14977 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm6, %zmm0
14978 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [2,2,2,3,8,8,8,9]
14979 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %xmm6
14980 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %xmm5
14981 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
14982 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm1
14983 ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm1
14984 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm2
14985 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm14, %ymm1
14986 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,3,3,3,7,7,7,7]
14987 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7,8,9,10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15]
14988 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm15
14989 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm9
14990 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
14991 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm14
14992 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm20, %zmm1
14993 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [6,7,3,3,7,7,6,7]
14994 ; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm17, %ymm16
14995 ; AVX512DQ-FCP-NEXT: vpbroadcastd 96(%rax), %ymm20
14996 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm16, %zmm25
14997 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
14998 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm25 = zmm25 ^ (zmm18 & (zmm25 ^ zmm2))
14999 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
15000 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm27 & (zmm25 ^ zmm10))
15001 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
15002 ; AVX512DQ-FCP-NEXT: vprold $16, %xmm3, %xmm3
15003 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
15004 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
15005 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm21 & (zmm1 ^ zmm0))
15006 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
15007 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [0,0,2,1,8,8,9,11]
15008 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm21, %zmm3
15009 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
15010 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm2
15011 ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm2
15012 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,2,2]
15013 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm2[1],xmm7[2,3],xmm2[4],xmm7[5,6],xmm2[7]
15014 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,3,3,4,5,6,7]
15015 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm28, %zmm7
15016 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
15017 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm3 ^ (zmm28 & (zmm7 ^ zmm3))
15018 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
15019 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
15020 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15021 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm16 = ymm16 ^ (ymm18 & (ymm16 ^ ymm0))
15022 ; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm13
15023 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
15024 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
15025 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,1,3,8,8,9,9]
15026 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm6
15027 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm2
15028 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
15029 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm19[3,3,3,3,7,7,7,7]
15030 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
15031 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm3
15032 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm11
15033 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3]
15034 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
15035 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm30, %zmm8
15036 ; AVX512DQ-FCP-NEXT: vpbroadcastd 100(%rax), %ymm5
15037 ; AVX512DQ-FCP-NEXT: vpbroadcastd 104(%rax), %ymm19
15038 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm5, %zmm5
15039 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
15040 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = zmm5 ^ (zmm24 & (zmm5 ^ zmm6))
15041 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
15042 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
15043 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[3,3,3,3]
15044 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
15045 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
15046 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
15047 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm14
15048 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,1,1]
15049 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
15050 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm10
15051 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm12, %xmm30
15052 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
15053 ; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm17, %ymm17
15054 ; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm19
15055 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm17, %zmm20
15056 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm20 = zmm20 ^ (zmm18 & (zmm20 ^ zmm8))
15057 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm27 & (zmm20 ^ zmm1))
15058 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
15059 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm7))
15060 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
15061 ; AVX512DQ-FCP-NEXT: vprold $16, %xmm9, %xmm8
15062 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[1,1,2,3]
15063 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7]
15064 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
15065 ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7
15066 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm21, %zmm8
15067 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm0
15068 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
15069 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm7
15070 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15071 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
15072 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[1,1,2,2]
15073 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0],xmm7[1],xmm12[2,3],xmm7[4],xmm12[5,6],xmm7[7]
15074 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,3,3,4,5,6,7]
15075 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,0,1,1,8,8,10,9]
15076 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm7
15077 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm8 ^ (zmm28 & (zmm7 ^ zmm8))
15078 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7]
15079 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm0
15080 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3
15081 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
15082 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [0,1,1,3,8,8,9,9]
15083 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm2
15084 ; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm3
15085 ; AVX512DQ-FCP-NEXT: vpbroadcastd 40(%rax), %ymm8
15086 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3
15087 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm3 = zmm3 ^ (zmm24 & (zmm3 ^ zmm2))
15088 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm1 & (zmm3 ^ zmm7))
15089 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
15090 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm13 = mem ^ (ymm0 & (ymm13 ^ mem))
15091 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
15092 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm1 & (ymm6 ^ ymm13))
15093 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
15094 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm2 & (ymm4 ^ ymm16))
15095 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0
15096 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm0[4,5,6,7]
15097 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
15098 ; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm0))
15099 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm0 # 32-byte Folded Reload
15100 ; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload
15101 ; AVX512DQ-FCP-NEXT: # zmm0 = (zmm0 & zmm2) | mem
15102 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
15103 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm2 # 32-byte Folded Reload
15104 ; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
15105 ; AVX512DQ-FCP-NEXT: # zmm2 = (zmm2 & zmm4) | mem
15106 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
15107 ; AVX512DQ-FCP-NEXT: vpunpckhwd (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload
15108 ; AVX512DQ-FCP-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
15109 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
15110 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
15111 ; AVX512DQ-FCP-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
15112 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
15113 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
15114 ; AVX512DQ-FCP-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7]
15115 ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm4
15116 ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7
15117 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
15118 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm8[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
15119 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
15120 ; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
15121 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15]
15122 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
15123 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
15124 ; AVX512DQ-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
15125 ; AVX512DQ-FCP-NEXT: # ymm10 = mem[1,1,1,1,5,5,5,5]
15126 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
15127 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,1,4,5,4,5,5,7]
15128 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
15129 ; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm10, %ymm10
15130 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17],zero,zero
15131 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
15132 ; AVX512DQ-FCP-NEXT: vpandn %ymm10, %ymm12, %ymm10
15133 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
15134 ; AVX512DQ-FCP-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
15135 ; AVX512DQ-FCP-NEXT: # xmm11 = mem[0,2,3,3,4,5,6,7]
15136 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
15137 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3]
15138 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7]
15139 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
15140 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,3]
15141 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2]
15142 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
15143 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
15144 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
15145 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm12))
15146 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload
15147 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
15148 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm11 ^ (zmm28 & (zmm4 ^ zmm11))
15149 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
15150 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 32-byte Folded Reload
15151 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm6 ^ (zmm28 & (zmm7 ^ zmm6))
15152 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
15153 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm0 ^ (zmm6 & (zmm4 ^ zmm0))
15154 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm2 ^ (zmm6 & (zmm7 ^ zmm2))
15155 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15156 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm0
15157 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15158 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2
15159 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 ^ (mem & (zmm2 ^ zmm0))
15160 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15161 ; AVX512DQ-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
15162 ; AVX512DQ-FCP-NEXT: # zmm0 = zmm0 | (zmm1 & mem)
15163 ; AVX512DQ-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm10 # 64-byte Folded Reload
15164 ; AVX512DQ-FCP-NEXT: # zmm10 = zmm10 | (zmm1 & mem)
15165 ; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm2))
15166 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
15167 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 320(%rax)
15168 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 256(%rax)
15169 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax)
15170 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 128(%rax)
15171 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, (%rax)
15172 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 448(%rax)
15173 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 704(%rax)
15174 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 640(%rax)
15175 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
15176 ; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 576(%rax)
15177 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax)
15178 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 64(%rax)
15179 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 512(%rax)
15180 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 832(%rax)
15181 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 768(%rax)
15182 ; AVX512DQ-FCP-NEXT: addq $1496, %rsp # imm = 0x5D8
15183 ; AVX512DQ-FCP-NEXT: vzeroupper
15184 ; AVX512DQ-FCP-NEXT: retq
;
15186 ; AVX512BW-LABEL: store_i16_stride7_vf64:
15187 ; AVX512BW: # %bb.0:
15188 ; AVX512BW-NEXT: subq $136, %rsp
15189 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
15190 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm14
15191 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm29
15192 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm15
15193 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm9
15194 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm5
15195 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm25
15196 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm12
15197 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm13
15198 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4
15199 ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm30
15200 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm8
15201 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm26
15202 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm20
15203 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0]
15204 ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
15205 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0
15206 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm21, %zmm0
15207 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0]
15208 ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
15209 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3
15210 ; AVX512BW-NEXT: vpermt2w %zmm25, %zmm6, %zmm3
15211 ; AVX512BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830
15212 ; AVX512BW-NEXT: kmovd %ecx, %k1
15213 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1}
15214 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,59,0,0,4,5,6,7,60,0,0,11,12,13,14,61,0,0,18,19,20,21,62,0,0,25,26,27,28,63,0,0]
15215 ; AVX512BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0
15216 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15217 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27]
15218 ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
15219 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0
15220 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm0
15221 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25]
15222 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
15223 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3
15224 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm11, %zmm3
15225 ; AVX512BW-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18
15226 ; AVX512BW-NEXT: kmovd %ecx, %k2
15227 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2}
15228 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,55,0,0,8,9,10,11,56,0,0,15,16,17,18,57,0,0,22,23,24,25,58,0,0,29,30,31]
15229 ; AVX512BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0
15230 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
15231 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36]
15232 ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
15233 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm0
15234 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm0
15235 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34]
15236 ; AVX512BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3]
15237 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7
15238 ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm28, %zmm7
15239 ; AVX512BW-NEXT: movl $-1048377844, %ecx # imm = 0xC183060C
15240 ; AVX512BW-NEXT: kmovd %ecx, %k3
15241 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3}
15242 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0]
15243 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
15244 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15245 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0
15246 ; AVX512BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm0
15247 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm3
15248 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm16
15249 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27
15250 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm17
15251 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0]
15252 ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
15253 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm31
15254 ; AVX512BW-NEXT: vpermt2w %zmm29, %zmm23, %zmm31
15255 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm18
15256 ; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15257 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm25
15258 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm19
15259 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm28, %zmm29
15260 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0]
15261 ; AVX512BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0
15262 ; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870
15263 ; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3}
15264 ; AVX512BW-NEXT: kmovd %ecx, %k3
15265 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3}
15266 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0]
15267 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
15268 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10
15269 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm0, %zmm10
15270 ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22
15271 ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm1
15272 ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm28
15273 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
15274 ; AVX512BW-NEXT: vpermt2w %zmm26, %zmm25, %zmm30
15275 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm25
15276 ; AVX512BW-NEXT: vpermt2w %zmm25, %zmm2, %zmm30
15277 ; AVX512BW-NEXT: vmovdqu16 %zmm30, %zmm29 {%k3}
15278 ; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm21
15279 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm12, %zmm6
15280 ; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm6 {%k1}
15281 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29]
15282 ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
15283 ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm2
15284 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [0,1,2,59,0,0,0,0,8,9,60,0,0,0,0,15,16,61,0,0,0,0,22,23,62,0,0,0,0,29,30,63]
15285 ; AVX512BW-NEXT: vpermi2w %zmm20, %zmm2, %zmm21
15286 ; AVX512BW-NEXT: movl $-507279602, %eax # imm = 0xE1C3870E
15287 ; AVX512BW-NEXT: kmovd %eax, %k3
15288 ; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm6 {%k3}
15289 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0]
15290 ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
15291 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm2, %zmm3
15292 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52]
15293 ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
15294 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm21, %zmm16
15295 ; AVX512BW-NEXT: movl $202911840, %eax # imm = 0xC183060
15296 ; AVX512BW-NEXT: kmovd %eax, %k3
15297 ; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm16 {%k3}
15298 ; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm24
15299 ; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm11
15300 ; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2}
15301 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0]
15302 ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
15303 ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm3
15304 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0]
15305 ; AVX512BW-NEXT: vpermi2w %zmm20, %zmm3, %zmm24
15306 ; AVX512BW-NEXT: movl $473460961, %eax # imm = 0x1C3870E1
15307 ; AVX512BW-NEXT: kmovd %eax, %k2
15308 ; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2}
15309 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54]
15310 ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
15311 ; AVX512BW-NEXT: vpermt2w %zmm26, %zmm3, %zmm22
15312 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31]
15313 ; AVX512BW-NEXT: vpermt2w %zmm25, %zmm24, %zmm22
15314 ; AVX512BW-NEXT: movl $-1014559204, %eax # imm = 0xC3870E1C
15315 ; AVX512BW-NEXT: kmovd %eax, %k2
15316 ; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2}
15317 ; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm2
15318 ; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm21
15319 ; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm21 {%k3}
15320 ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm3
15321 ; AVX512BW-NEXT: vpermt2w %zmm20, %zmm24, %zmm3
15322 ; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm21 {%k2}
15323 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0]
15324 ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
15325 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm2, %zmm27
15326 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50]
15327 ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
15328 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm3, %zmm17
15329 ; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm17 {%k1}
15330 ; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm2
15331 ; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm3
15332 ; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
15333 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0]
15334 ; AVX512BW-NEXT: vpermt2w %zmm25, %zmm2, %zmm10
15335 ; AVX512BW-NEXT: movl $946921923, %eax # imm = 0x3870E1C3
15336 ; AVX512BW-NEXT: kmovd %eax, %k1
15337 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm17 {%k1}
15338 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm8, %zmm0
15339 ; AVX512BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0
15340 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1}
15341 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11]
15342 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
15343 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm0, %zmm18
15344 ; AVX512BW-NEXT: movl $405823681, %eax # imm = 0x183060C1
15345 ; AVX512BW-NEXT: kmovd %eax, %k1
15346 ; AVX512BW-NEXT: vmovdqu16 %zmm31, %zmm18 {%k1}
15347 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13]
15348 ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
15349 ; AVX512BW-NEXT: vpermt2w %zmm26, %zmm2, %zmm1
15350 ; AVX512BW-NEXT: vpermi2w %zmm14, %zmm15, %zmm23
15351 ; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm0
15352 ; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm0 {%k1}
15353 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31]
15354 ; AVX512BW-NEXT: vpermt2w %zmm25, %zmm10, %zmm1
15355 ; AVX512BW-NEXT: movl $-2029118408, %eax # imm = 0x870E1C38
15356 ; AVX512BW-NEXT: kmovd %eax, %k1
15357 ; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm18 {%k1}
15358 ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm2
15359 ; AVX512BW-NEXT: vpermt2w %zmm20, %zmm10, %zmm2
15360 ; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1}
15361 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0]
15362 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
15363 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15364 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm1, %zmm2
15365 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm1, %zmm5
15366 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9]
15367 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
15368 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm1, %zmm19
15369 ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm1, %zmm14
15370 ; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm19 {%k3}
15371 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38]
15372 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
15373 ; AVX512BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm4
15374 ; AVX512BW-NEXT: vpermt2w %zmm26, %zmm1, %zmm28
15375 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0]
15376 ; AVX512BW-NEXT: vpermt2w %zmm20, %zmm1, %zmm4
15377 ; AVX512BW-NEXT: vpermt2w %zmm25, %zmm1, %zmm28
15378 ; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm14 {%k3}
15379 ; AVX512BW-NEXT: movl $1893843847, %eax # imm = 0x70E1C387
15380 ; AVX512BW-NEXT: kmovd %eax, %k1
15381 ; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm19 {%k1}
15382 ; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm14 {%k1}
15383 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,59,0,4,5,6,7,8,60,0,11,12,13,14,15,61,0,18,19,20,21,22,62,0,25,26,27,28,29,63,0]
15384 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15385 ; AVX512BW-NEXT: vpermi2w %zmm26, %zmm2, %zmm1
15386 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,55,0,8,9,10,11,12,56,0,15,16,17,18,19,57,0,22,23,24,25,26,58,0,29,30,31]
15387 ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
15388 ; AVX512BW-NEXT: vpermi2w %zmm26, %zmm4, %zmm2
15389 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63]
15390 ; AVX512BW-NEXT: vpermi2w %zmm25, %zmm1, %zmm4
15391 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31]
15392 ; AVX512BW-NEXT: vpermi2w %zmm25, %zmm2, %zmm1
15393 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
15394 ; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rax)
15395 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax)
15396 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rax)
15397 ; AVX512BW-NEXT: vmovdqa64 %zmm21, 256(%rax)
15398 ; AVX512BW-NEXT: vmovdqa64 %zmm11, 320(%rax)
15399 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rax)
15400 ; AVX512BW-NEXT: vmovdqa64 %zmm29, 448(%rax)
15401 ; AVX512BW-NEXT: vmovdqa64 %zmm19, 512(%rax)
15402 ; AVX512BW-NEXT: vmovdqa64 %zmm18, 576(%rax)
15403 ; AVX512BW-NEXT: vmovdqa64 %zmm17, 640(%rax)
15404 ; AVX512BW-NEXT: vmovdqa64 %zmm16, 704(%rax)
15405 ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax)
15406 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 768(%rax)
15407 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 832(%rax)
15408 ; AVX512BW-NEXT: addq $136, %rsp
15409 ; AVX512BW-NEXT: vzeroupper
15410 ; AVX512BW-NEXT: retq
;
15412 ; AVX512BW-FCP-LABEL: store_i16_stride7_vf64:
15413 ; AVX512BW-FCP: # %bb.0:
15414 ; AVX512BW-FCP-NEXT: subq $136, %rsp
15415 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
15416 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm14
15417 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29
15418 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm15
15419 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm9
15420 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5
15421 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm25
15422 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm12
15423 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm13
15424 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm4
15425 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm30
15426 ; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm8
15427 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm26
15428 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm20
15429 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0]
15430 ; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
15431 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
15432 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm21, %zmm0
15433 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0]
15434 ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
15435 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3
15436 ; AVX512BW-FCP-NEXT: vpermt2w %zmm25, %zmm6, %zmm3
15437 ; AVX512BW-FCP-NEXT: movl $101455920, %ecx # imm = 0x60C1830
15438 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
15439 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1}
15440 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,59,0,0,4,5,6,7,60,0,0,11,12,13,14,61,0,0,18,19,20,21,62,0,0,25,26,27,28,63,0,0]
15441 ; AVX512BW-FCP-NEXT: vpermi2w %zmm30, %zmm3, %zmm0
15442 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15443 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27]
15444 ; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
15445 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0
15446 ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm24, %zmm0
15447 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25]
15448 ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
15449 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3
15450 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm11, %zmm3
15451 ; AVX512BW-FCP-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18
15452 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2
15453 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2}
15454 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,55,0,0,8,9,10,11,56,0,0,15,16,17,18,57,0,0,22,23,24,25,58,0,0,29,30,31]
15455 ; AVX512BW-FCP-NEXT: vpermi2w %zmm30, %zmm3, %zmm0
15456 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
15457 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36]
15458 ; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
15459 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0
15460 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm19, %zmm0
15461 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34]
15462 ; AVX512BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3]
15463 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7
15464 ; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm28, %zmm7
15465 ; AVX512BW-FCP-NEXT: movl $-1048377844, %ecx # imm = 0xC183060C
15466 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3
15467 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3}
15468 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0]
15469 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
15470 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15471 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0
15472 ; AVX512BW-FCP-NEXT: vpermt2w %zmm8, %zmm1, %zmm0
15473 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3
15474 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm16
15475 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27
15476 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm17
15477 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0]
15478 ; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
15479 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm31
15480 ; AVX512BW-FCP-NEXT: vpermt2w %zmm29, %zmm23, %zmm31
15481 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm18
15482 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15483 ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm19, %zmm25
15484 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm19
15485 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm28, %zmm29
15486 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0]
15487 ; AVX512BW-FCP-NEXT: vpermt2w %zmm20, %zmm2, %zmm0
15488 ; AVX512BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870
15489 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3}
15490 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3
15491 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3}
15492 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0]
15493 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
15494 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10
15495 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm0, %zmm10
15496 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm22
15497 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1
15498 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm28
15499 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
15500 ; AVX512BW-FCP-NEXT: vpermt2w %zmm26, %zmm25, %zmm30
15501 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm25
15502 ; AVX512BW-FCP-NEXT: vpermt2w %zmm25, %zmm2, %zmm30
15503 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm30, %zmm29 {%k3}
15504 ; AVX512BW-FCP-NEXT: vpermi2w %zmm15, %zmm14, %zmm21
15505 ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm12, %zmm6
15506 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm6 {%k1}
15507 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29]
15508 ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
15509 ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm4, %zmm2
15510 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm21 = [0,1,2,59,0,0,0,0,8,9,60,0,0,0,0,15,16,61,0,0,0,0,22,23,62,0,0,0,0,29,30,63]
15511 ; AVX512BW-FCP-NEXT: vpermi2w %zmm20, %zmm2, %zmm21
15512 ; AVX512BW-FCP-NEXT: movl $-507279602, %eax # imm = 0xE1C3870E
15513 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3
15514 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm6 {%k3}
15515 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0]
15516 ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
15517 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm2, %zmm3
15518 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52]
15519 ; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
15520 ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm21, %zmm16
15521 ; AVX512BW-FCP-NEXT: movl $202911840, %eax # imm = 0xC183060
15522 ; AVX512BW-FCP-NEXT: kmovd %eax, %k3
15523 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm16 {%k3}
15524 ; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm5, %zmm24
15525 ; AVX512BW-FCP-NEXT: vpermi2w %zmm15, %zmm14, %zmm11
15526 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2}
15527 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0]
15528 ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
15529 ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm4, %zmm3
15530 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm24 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0]
15531 ; AVX512BW-FCP-NEXT: vpermi2w %zmm20, %zmm3, %zmm24
15532 ; AVX512BW-FCP-NEXT: movl $473460961, %eax # imm = 0x1C3870E1
15533 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2
15534 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2}
15535 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54]
15536 ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
15537 ; AVX512BW-FCP-NEXT: vpermt2w %zmm26, %zmm3, %zmm22
15538 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm24 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31]
15539 ; AVX512BW-FCP-NEXT: vpermt2w %zmm25, %zmm24, %zmm22
15540 ; AVX512BW-FCP-NEXT: movl $-1014559204, %eax # imm = 0xC3870E1C
15541 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2
15542 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2}
15543 ; AVX512BW-FCP-NEXT: vpermi2w %zmm15, %zmm14, %zmm2
15544 ; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm5, %zmm21
15545 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm21 {%k3}
15546 ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm4, %zmm3
15547 ; AVX512BW-FCP-NEXT: vpermt2w %zmm20, %zmm24, %zmm3
15548 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm21 {%k2}
15549 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0]
15550 ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
15551 ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm2, %zmm27
15552 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50]
15553 ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
15554 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm3, %zmm17
15555 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm27, %zmm17 {%k1}
15556 ; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm5, %zmm2
15557 ; AVX512BW-FCP-NEXT: vpermi2w %zmm15, %zmm14, %zmm3
15558 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
15559 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0]
15560 ; AVX512BW-FCP-NEXT: vpermt2w %zmm25, %zmm2, %zmm10
15561 ; AVX512BW-FCP-NEXT: movl $946921923, %eax # imm = 0x3870E1C3
15562 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
15563 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm17 {%k1}
15564 ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm8, %zmm0
15565 ; AVX512BW-FCP-NEXT: vpermt2w %zmm20, %zmm2, %zmm0
15566 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1}
15567 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11]
15568 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
15569 ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm0, %zmm18
15570 ; AVX512BW-FCP-NEXT: movl $405823681, %eax # imm = 0x183060C1
15571 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
15572 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm31, %zmm18 {%k1}
15573 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13]
15574 ; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
15575 ; AVX512BW-FCP-NEXT: vpermt2w %zmm26, %zmm2, %zmm1
15576 ; AVX512BW-FCP-NEXT: vpermi2w %zmm14, %zmm15, %zmm23
15577 ; AVX512BW-FCP-NEXT: vpermi2w %zmm12, %zmm5, %zmm0
15578 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm23, %zmm0 {%k1}
15579 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31]
15580 ; AVX512BW-FCP-NEXT: vpermt2w %zmm25, %zmm10, %zmm1
15581 ; AVX512BW-FCP-NEXT: movl $-2029118408, %eax # imm = 0x870E1C38
15582 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
15583 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm18 {%k1}
15584 ; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm4, %zmm2
15585 ; AVX512BW-FCP-NEXT: vpermt2w %zmm20, %zmm10, %zmm2
15586 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1}
15587 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0]
15588 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
15589 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15590 ; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm1, %zmm2
15591 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm1, %zmm5
15592 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9]
15593 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
15594 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm1, %zmm19
15595 ; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm1, %zmm14
15596 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm19 {%k3}
15597 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38]
15598 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
15599 ; AVX512BW-FCP-NEXT: vpermt2w %zmm8, %zmm1, %zmm4
15600 ; AVX512BW-FCP-NEXT: vpermt2w %zmm26, %zmm1, %zmm28
15601 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0]
15602 ; AVX512BW-FCP-NEXT: vpermt2w %zmm20, %zmm1, %zmm4
15603 ; AVX512BW-FCP-NEXT: vpermt2w %zmm25, %zmm1, %zmm28
15604 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm14 {%k3}
15605 ; AVX512BW-FCP-NEXT: movl $1893843847, %eax # imm = 0x70E1C387
15606 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
15607 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm28, %zmm19 {%k1}
15608 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm14 {%k1}
15609 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,59,0,4,5,6,7,8,60,0,11,12,13,14,15,61,0,18,19,20,21,22,62,0,25,26,27,28,29,63,0]
15610 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15611 ; AVX512BW-FCP-NEXT: vpermi2w %zmm26, %zmm2, %zmm1
15612 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,55,0,8,9,10,11,12,56,0,15,16,17,18,19,57,0,22,23,24,25,26,58,0,29,30,31]
15613 ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
15614 ; AVX512BW-FCP-NEXT: vpermi2w %zmm26, %zmm4, %zmm2
15615 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63]
15616 ; AVX512BW-FCP-NEXT: vpermi2w %zmm25, %zmm1, %zmm4
15617 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31]
15618 ; AVX512BW-FCP-NEXT: vpermi2w %zmm25, %zmm2, %zmm1
15619 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
15620 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rax)
15621 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax)
15622 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax)
15623 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 256(%rax)
15624 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 320(%rax)
15625 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax)
15626 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 448(%rax)
15627 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax)
15628 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 576(%rax)
15629 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 640(%rax)
15630 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 704(%rax)
15631 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
15632 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 768(%rax)
15633 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 832(%rax)
15634 ; AVX512BW-FCP-NEXT: addq $136, %rsp
15635 ; AVX512BW-FCP-NEXT: vzeroupper
15636 ; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i16_stride7_vf64:
15639 ; AVX512DQ-BW: # %bb.0:
15640 ; AVX512DQ-BW-NEXT: subq $136, %rsp
15641 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
15642 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm14
15643 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm29
15644 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm15
15645 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm9
15646 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %zmm5
15647 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdx), %zmm25
15648 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %zmm12
15649 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rcx), %zmm13
15650 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm4
15651 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r8), %zmm30
15652 ; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm8
15653 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%r9), %zmm26
15654 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm20
15655 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0]
15656 ; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
15657 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0
15658 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm21, %zmm0
15659 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0]
15660 ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
15661 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm3
15662 ; AVX512DQ-BW-NEXT: vpermt2w %zmm25, %zmm6, %zmm3
15663 ; AVX512DQ-BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830
15664 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
15665 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1}
15666 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,59,0,0,4,5,6,7,60,0,0,11,12,13,14,61,0,0,18,19,20,21,62,0,0,25,26,27,28,63,0,0]
15667 ; AVX512DQ-BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0
15668 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15669 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27]
15670 ; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
15671 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0
15672 ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm0
15673 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25]
15674 ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
15675 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm3
15676 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm11, %zmm3
15677 ; AVX512DQ-BW-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18
15678 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k2
15679 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2}
15680 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,55,0,0,8,9,10,11,56,0,0,15,16,17,18,57,0,0,22,23,24,25,58,0,0,29,30,31]
15681 ; AVX512DQ-BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0
15682 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
15683 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36]
15684 ; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
15685 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm0
15686 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm0
15687 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34]
15688 ; AVX512DQ-BW-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3]
15689 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm7
15690 ; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm28, %zmm7
15691 ; AVX512DQ-BW-NEXT: movl $-1048377844, %ecx # imm = 0xC183060C
15692 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3
15693 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3}
15694 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0]
15695 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
15696 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15697 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm0
15698 ; AVX512DQ-BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm0
15699 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm3
15700 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm16
15701 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm27
15702 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm17
15703 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0]
15704 ; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
15705 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm31
15706 ; AVX512DQ-BW-NEXT: vpermt2w %zmm29, %zmm23, %zmm31
15707 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm18
15708 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15709 ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm25
15710 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm19
15711 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm28, %zmm29
15712 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0]
15713 ; AVX512DQ-BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0
15714 ; AVX512DQ-BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870
15715 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3}
15716 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k3
15717 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3}
15718 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0]
15719 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
15720 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm10
15721 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm0, %zmm10
15722 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm22
15723 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm1
15724 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm28
15725 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
15726 ; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm25, %zmm30
15727 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rax), %zmm25
15728 ; AVX512DQ-BW-NEXT: vpermt2w %zmm25, %zmm2, %zmm30
15729 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm30, %zmm29 {%k3}
15730 ; AVX512DQ-BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm21
15731 ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm12, %zmm6
15732 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm6 {%k1}
15733 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29]
15734 ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
15735 ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm2
15736 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [0,1,2,59,0,0,0,0,8,9,60,0,0,0,0,15,16,61,0,0,0,0,22,23,62,0,0,0,0,29,30,63]
15737 ; AVX512DQ-BW-NEXT: vpermi2w %zmm20, %zmm2, %zmm21
15738 ; AVX512DQ-BW-NEXT: movl $-507279602, %eax # imm = 0xE1C3870E
15739 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3
15740 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm6 {%k3}
15741 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0]
15742 ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
15743 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm2, %zmm3
15744 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52]
15745 ; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
15746 ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm21, %zmm16
15747 ; AVX512DQ-BW-NEXT: movl $202911840, %eax # imm = 0xC183060
15748 ; AVX512DQ-BW-NEXT: kmovd %eax, %k3
15749 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm16 {%k3}
15750 ; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm24
15751 ; AVX512DQ-BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm11
15752 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2}
15753 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0]
15754 ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
15755 ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm3
15756 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0]
15757 ; AVX512DQ-BW-NEXT: vpermi2w %zmm20, %zmm3, %zmm24
15758 ; AVX512DQ-BW-NEXT: movl $473460961, %eax # imm = 0x1C3870E1
15759 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2
15760 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2}
15761 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54]
15762 ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
15763 ; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm3, %zmm22
15764 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm24 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31]
15765 ; AVX512DQ-BW-NEXT: vpermt2w %zmm25, %zmm24, %zmm22
15766 ; AVX512DQ-BW-NEXT: movl $-1014559204, %eax # imm = 0xC3870E1C
15767 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2
15768 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2}
15769 ; AVX512DQ-BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm2
15770 ; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm21
15771 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm21 {%k3}
15772 ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm3
15773 ; AVX512DQ-BW-NEXT: vpermt2w %zmm20, %zmm24, %zmm3
15774 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm21 {%k2}
15775 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0]
15776 ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
15777 ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm2, %zmm27
15778 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50]
15779 ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
15780 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm3, %zmm17
15781 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm27, %zmm17 {%k1}
15782 ; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm2
15783 ; AVX512DQ-BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm3
15784 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
15785 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0]
15786 ; AVX512DQ-BW-NEXT: vpermt2w %zmm25, %zmm2, %zmm10
15787 ; AVX512DQ-BW-NEXT: movl $946921923, %eax # imm = 0x3870E1C3
15788 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
15789 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm17 {%k1}
15790 ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm8, %zmm0
15791 ; AVX512DQ-BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0
15792 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1}
15793 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11]
15794 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
15795 ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm0, %zmm18
15796 ; AVX512DQ-BW-NEXT: movl $405823681, %eax # imm = 0x183060C1
15797 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
15798 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm31, %zmm18 {%k1}
15799 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13]
15800 ; AVX512DQ-BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
15801 ; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm2, %zmm1
15802 ; AVX512DQ-BW-NEXT: vpermi2w %zmm14, %zmm15, %zmm23
15803 ; AVX512DQ-BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm0
15804 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm23, %zmm0 {%k1}
15805 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31]
15806 ; AVX512DQ-BW-NEXT: vpermt2w %zmm25, %zmm10, %zmm1
15807 ; AVX512DQ-BW-NEXT: movl $-2029118408, %eax # imm = 0x870E1C38
15808 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
15809 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm18 {%k1}
15810 ; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm2
15811 ; AVX512DQ-BW-NEXT: vpermt2w %zmm20, %zmm10, %zmm2
15812 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1}
15813 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0]
15814 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
15815 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15816 ; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm1, %zmm2
15817 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm1, %zmm5
15818 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9]
15819 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
15820 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm1, %zmm19
15821 ; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm1, %zmm14
15822 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm19 {%k3}
15823 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38]
15824 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
15825 ; AVX512DQ-BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm4
15826 ; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm1, %zmm28
15827 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0]
15828 ; AVX512DQ-BW-NEXT: vpermt2w %zmm20, %zmm1, %zmm4
15829 ; AVX512DQ-BW-NEXT: vpermt2w %zmm25, %zmm1, %zmm28
15830 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm14 {%k3}
15831 ; AVX512DQ-BW-NEXT: movl $1893843847, %eax # imm = 0x70E1C387
15832 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
15833 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm28, %zmm19 {%k1}
15834 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm4, %zmm14 {%k1}
15835 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,59,0,4,5,6,7,8,60,0,11,12,13,14,15,61,0,18,19,20,21,22,62,0,25,26,27,28,29,63,0]
15836 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15837 ; AVX512DQ-BW-NEXT: vpermi2w %zmm26, %zmm2, %zmm1
15838 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,55,0,8,9,10,11,12,56,0,15,16,17,18,19,57,0,22,23,24,25,26,58,0,29,30,31]
15839 ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
15840 ; AVX512DQ-BW-NEXT: vpermi2w %zmm26, %zmm4, %zmm2
15841 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63]
15842 ; AVX512DQ-BW-NEXT: vpermi2w %zmm25, %zmm1, %zmm4
15843 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31]
15844 ; AVX512DQ-BW-NEXT: vpermi2w %zmm25, %zmm2, %zmm1
15845 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
15846 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%rax)
15847 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rax)
15848 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 192(%rax)
15849 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 256(%rax)
15850 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 320(%rax)
15851 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 384(%rax)
15852 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 448(%rax)
15853 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 512(%rax)
15854 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 576(%rax)
15855 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 640(%rax)
15856 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 704(%rax)
15857 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rax)
15858 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 768(%rax)
15859 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 832(%rax)
15860 ; AVX512DQ-BW-NEXT: addq $136, %rsp
15861 ; AVX512DQ-BW-NEXT: vzeroupper
15862 ; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i16_stride7_vf64:
15865 ; AVX512DQ-BW-FCP: # %bb.0:
15866 ; AVX512DQ-BW-FCP-NEXT: subq $136, %rsp
15867 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
15868 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm14
15869 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29
15870 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm15
15871 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm9
15872 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %zmm5
15873 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdx), %zmm25
15874 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %zmm12
15875 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rcx), %zmm13
15876 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm4
15877 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r8), %zmm30
15878 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm8
15879 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%r9), %zmm26
15880 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm20
15881 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0,0,0,30,62,28,60,0,0,0,31,63,29,61,0,0,0]
15882 ; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
15883 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
15884 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm21, %zmm0
15885 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0,27,0,0,0,62,30,60,28,0,0,0,63,31,61,29,0]
15886 ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
15887 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3
15888 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, %zmm6, %zmm3
15889 ; AVX512DQ-BW-FCP-NEXT: movl $101455920, %ecx # imm = 0x60C1830
15890 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
15891 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1}
15892 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,59,0,0,4,5,6,7,60,0,0,11,12,13,14,61,0,0,18,19,20,21,62,0,0,25,26,27,28,63,0,0]
15893 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm30, %zmm3, %zmm0
15894 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15895 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27]
15896 ; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
15897 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0
15898 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm24, %zmm0
15899 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,59,25]
15900 ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
15901 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3
15902 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm11, %zmm3
15903 ; AVX512DQ-BW-FCP-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18
15904 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2
15905 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2}
15906 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,4,55,0,0,8,9,10,11,56,0,0,15,16,17,18,57,0,0,22,23,24,25,58,0,0,29,30,31]
15907 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm30, %zmm3, %zmm0
15908 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
15909 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36]
15910 ; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
15911 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0
15912 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm19, %zmm0
15913 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm28 = [0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34]
15914 ; AVX512DQ-BW-FCP-NEXT: # zmm28 = mem[0,1,2,3,0,1,2,3]
15915 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7
15916 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm28, %zmm7
15917 ; AVX512DQ-BW-FCP-NEXT: movl $-1048377844, %ecx # imm = 0xC183060C
15918 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3
15919 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3}
15920 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0]
15921 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
15922 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15923 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm0
15924 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm8, %zmm1, %zmm0
15925 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm3
15926 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm16
15927 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm27
15928 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm17
15929 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0,9,0,0,0,44,12,42,10,0,0,0,45,13,43,11,0]
15930 ; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
15931 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm31
15932 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm29, %zmm23, %zmm31
15933 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm18
15934 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15935 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm19, %zmm25
15936 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm19
15937 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm28, %zmm29
15938 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,0,0,0,4,5,32,0,0,0,0,11,12,33,0,0,0,0,18,19,34,0,0,0,0,25,26,35,0,0,0,0]
15939 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm20, %zmm2, %zmm0
15940 ; AVX512DQ-BW-FCP-NEXT: movl $236730480, %ecx # imm = 0xE1C3870
15941 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3}
15942 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3
15943 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm7 {%k3}
15944 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0]
15945 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
15946 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10
15947 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm0, %zmm10
15948 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm22
15949 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm1
15950 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm28
15951 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
15952 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm26, %zmm25, %zmm30
15953 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rax), %zmm25
15954 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, %zmm2, %zmm30
15955 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm30, %zmm29 {%k3}
15956 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm15, %zmm14, %zmm21
15957 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm12, %zmm6
15958 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm6 {%k1}
15959 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29]
15960 ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
15961 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm4, %zmm2
15962 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm21 = [0,1,2,59,0,0,0,0,8,9,60,0,0,0,0,15,16,61,0,0,0,0,22,23,62,0,0,0,0,29,30,63]
15963 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm20, %zmm2, %zmm21
15964 ; AVX512DQ-BW-FCP-NEXT: movl $-507279602, %eax # imm = 0xE1C3870E
15965 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3
15966 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm6 {%k3}
15967 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0,0,0,0,21,53,19,51,0,0,0,22,54,20,52,0,0]
15968 ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
15969 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm2, %zmm3
15970 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52]
15971 ; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
15972 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm21, %zmm16
15973 ; AVX512DQ-BW-FCP-NEXT: movl $202911840, %eax # imm = 0xC183060
15974 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3
15975 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm16 {%k3}
15976 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm5, %zmm24
15977 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm15, %zmm14, %zmm11
15978 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2}
15979 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0]
15980 ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
15981 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm4, %zmm3
15982 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm24 = [54,0,0,0,0,5,6,55,0,0,0,0,12,13,56,0,0,0,0,19,20,57,0,0,0,0,26,27,58,0,0,0]
15983 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm20, %zmm3, %zmm24
15984 ; AVX512DQ-BW-FCP-NEXT: movl $473460961, %eax # imm = 0x1C3870E1
15985 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2
15986 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm11 {%k2}
15987 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54]
15988 ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
15989 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm26, %zmm3, %zmm22
15990 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm24 = [0,0,2,3,50,0,0,0,0,9,10,51,0,0,0,0,16,17,52,0,0,0,0,23,24,53,0,0,0,0,30,31]
15991 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, %zmm24, %zmm22
15992 ; AVX512DQ-BW-FCP-NEXT: movl $-1014559204, %eax # imm = 0xC3870E1C
15993 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2
15994 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2}
15995 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm15, %zmm14, %zmm2
15996 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm5, %zmm21
15997 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm21 {%k3}
15998 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm4, %zmm3
15999 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm20, %zmm24, %zmm3
16000 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm21 {%k2}
16001 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0,0,0,16,48,14,46,0,0,0,17,49,15,47,0,0,0]
16002 ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
16003 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm2, %zmm27
16004 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50,16,48,14,46,0,0,0,17,49,15,47,0,0,0,18,50]
16005 ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
16006 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm3, %zmm17
16007 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm27, %zmm17 {%k1}
16008 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm5, %zmm2
16009 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm15, %zmm14, %zmm3
16010 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1}
16011 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,45,0,0,0,0,6,7,46,0,0,0,0,13,14,47,0,0,0,0,20,21,48,0,0,0,0,27,28,49,0,0]
16012 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, %zmm2, %zmm10
16013 ; AVX512DQ-BW-FCP-NEXT: movl $946921923, %eax # imm = 0x3870E1C3
16014 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
16015 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm17 {%k1}
16016 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm8, %zmm0
16017 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm20, %zmm2, %zmm0
16018 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1}
16019 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,45,11]
16020 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
16021 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm0, %zmm18
16022 ; AVX512DQ-BW-FCP-NEXT: movl $405823681, %eax # imm = 0x183060C1
16023 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
16024 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm31, %zmm18 {%k1}
16025 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13]
16026 ; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
16027 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm26, %zmm2, %zmm1
16028 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm14, %zmm15, %zmm23
16029 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm12, %zmm5, %zmm0
16030 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm23, %zmm0 {%k1}
16031 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm10 = [0,0,0,3,4,41,0,0,0,0,10,11,42,0,0,0,0,17,18,43,0,0,0,0,24,25,44,0,0,0,0,31]
16032 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, %zmm10, %zmm1
16033 ; AVX512DQ-BW-FCP-NEXT: movl $-2029118408, %eax # imm = 0x870E1C38
16034 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
16035 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm18 {%k1}
16036 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm4, %zmm2
16037 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm20, %zmm10, %zmm2
16038 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1}
16039 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,0,0,7,39,5,37,0,0,0,8,40,6,38,0,0]
16040 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
16041 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16042 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm1, %zmm2
16043 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm1, %zmm5
16044 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9,0,7,39,5,37,0,0,0,8,40,6,38,0,0,0,9]
16045 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
16046 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm1, %zmm19
16047 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm1, %zmm14
16048 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm19 {%k3}
16049 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38]
16050 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
16051 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm8, %zmm1, %zmm4
16052 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm26, %zmm1, %zmm28
16053 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,36,0,0,0,0,7,8,37,0,0,0,0,14,15,38,0,0,0,0,21,22,39,0,0,0,0,28,29,40,0]
16054 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm20, %zmm1, %zmm4
16055 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, %zmm1, %zmm28
16056 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm14 {%k3}
16057 ; AVX512DQ-BW-FCP-NEXT: movl $1893843847, %eax # imm = 0x70E1C387
16058 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
16059 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm28, %zmm19 {%k1}
16060 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm14 {%k1}
16061 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,1,59,0,4,5,6,7,8,60,0,11,12,13,14,15,61,0,18,19,20,21,22,62,0,25,26,27,28,29,63,0]
16062 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16063 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm26, %zmm2, %zmm1
16064 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,55,0,8,9,10,11,12,56,0,15,16,17,18,19,57,0,22,23,24,25,26,58,0,29,30,31]
16065 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
16066 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm26, %zmm4, %zmm2
16067 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63]
16068 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm25, %zmm1, %zmm4
16069 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [54,1,2,3,4,5,6,55,8,9,10,11,12,13,56,15,16,17,18,19,20,57,22,23,24,25,26,27,58,29,30,31]
16070 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm25, %zmm2, %zmm1
16071 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
16072 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rax)
16073 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax)
16074 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax)
16075 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 256(%rax)
16076 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 320(%rax)
16077 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax)
16078 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 448(%rax)
16079 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 512(%rax)
16080 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 576(%rax)
16081 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 640(%rax)
16082 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 704(%rax)
16083 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
16084 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 768(%rax)
16085 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 832(%rax)
16086 ; AVX512DQ-BW-FCP-NEXT: addq $136, %rsp
16087 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
16088 ; AVX512DQ-BW-FCP-NEXT: retq
16089 %in.vec0 = load <64 x i16>, ptr %in.vecptr0, align 64
16090 %in.vec1 = load <64 x i16>, ptr %in.vecptr1, align 64
16091 %in.vec2 = load <64 x i16>, ptr %in.vecptr2, align 64
16092 %in.vec3 = load <64 x i16>, ptr %in.vecptr3, align 64
16093 %in.vec4 = load <64 x i16>, ptr %in.vecptr4, align 64
16094 %in.vec5 = load <64 x i16>, ptr %in.vecptr5, align 64
16095 %in.vec6 = load <64 x i16>, ptr %in.vecptr6, align 64
16096 %1 = shufflevector <64 x i16> %in.vec0, <64 x i16> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
16097 %2 = shufflevector <64 x i16> %in.vec2, <64 x i16> %in.vec3, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
16098 %3 = shufflevector <64 x i16> %in.vec4, <64 x i16> %in.vec5, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
16099 %4 = shufflevector <128 x i16> %1, <128 x i16> %2, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255>
16100 %5 = shufflevector <64 x i16> %in.vec6, <64 x i16> poison, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
16101 %6 = shufflevector <128 x i16> %3, <128 x i16> %5, <192 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191>
16102 %7 = shufflevector <192 x i16> %6, <192 x i16> poison, <256 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
16103 %8 = shufflevector <256 x i16> %4, <256 x i16> %7, <448 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 136, i32 137, i32 138, i32 139, i32 140, i32 141, i32 142, i32 143, i32 144, i32 145, i32 146, i32 147, i32 148, i32 149, i32 150, i32 151, i32 152, i32 153, i32 154, i32 155, i32 156, i32 157, i32 158, i32 159, i32 160, i32 161, i32 162, i32 163, i32 164, i32 165, i32 166, i32 167, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199, i32 200, i32 201, i32 202, i32 203, i32 204, i32 205, i32 206, i32 207, i32 208, i32 209, i32 210, i32 211, i32 212, i32 213, i32 214, i32 215, i32 216, i32 217, i32 218, i32 219, i32 220, i32 221, i32 222, i32 223, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 248, i32 249, i32 250, i32 251, i32 252, i32 253, i32 254, i32 255, i32 256, i32 257, i32 258, i32 259, i32 260, i32 261, i32 262, i32 263, i32 264, i32 265, i32 266, i32 267, i32 268, i32 269, i32 270, i32 271, i32 272, i32 273, i32 274, i32 275, i32 276, i32 277, i32 278, i32 279, i32 280, i32 281, i32 282, i32 283, i32 284, i32 285, i32 286, i32 287, i32 288, i32 289, i32 290, i32 291, i32 292, i32 293, i32 294, i32 295, i32 296, i32 297, i32 298, i32 299, i32 300, i32 301, i32 302, i32 303, i32 304, i32 305, i32 306, i32 307, i32 308, i32 309, i32 310, i32 311, i32 312, i32 313, i32 314, i32 315, i32 316, i32 317, i32 318, i32 319, i32 320, i32 321, i32 322, i32 323, i32 324, i32 325, i32 326, i32 327, i32 328, i32 329, i32 330, i32 331, i32 332, i32 333, i32 334, i32 335, i32 336, i32 337, i32 338, i32 339, i32 340, i32 341, i32 342, i32 343, i32 344, i32 345, i32 346, i32 347, i32 348, i32 349, i32 350, i32 351, i32 352, i32 353, i32 354, i32 355, i32 356, i32 357, i32 358, i32 359, i32 360, i32 361, i32 362, i32 363, i32 364, i32 365, i32 366, i32 367, i32 368, i32 369, i32 370, i32 371, i32 372, i32 373, i32 374, i32 375, i32 376, i32 377, i32 378, i32 379, i32 380, i32 381, i32 382, i32 383, i32 384, i32 385, i32 386, i32 387, i32 388, i32 389, i32 390, i32 391, i32 392, i32 393, i32 394, i32 395, i32 396, i32 397, i32 398, i32 
399, i32 400, i32 401, i32 402, i32 403, i32 404, i32 405, i32 406, i32 407, i32 408, i32 409, i32 410, i32 411, i32 412, i32 413, i32 414, i32 415, i32 416, i32 417, i32 418, i32 419, i32 420, i32 421, i32 422, i32 423, i32 424, i32 425, i32 426, i32 427, i32 428, i32 429, i32 430, i32 431, i32 432, i32 433, i32 434, i32 435, i32 436, i32 437, i32 438, i32 439, i32 440, i32 441, i32 442, i32 443, i32 444, i32 445, i32 446, i32 447>
16104 %interleaved.vec = shufflevector <448 x i16> %8, <448 x i16> poison, <448 x i32> <i32 0, i32 64, i32 128, i32 192, i32 256, i32 320, i32 384, i32 1, i32 65, i32 129, i32 193, i32 257, i32 321, i32 385, i32 2, i32 66, i32 130, i32 194, i32 258, i32 322, i32 386, i32 3, i32 67, i32 131, i32 195, i32 259, i32 323, i32 387, i32 4, i32 68, i32 132, i32 196, i32 260, i32 324, i32 388, i32 5, i32 69, i32 133, i32 197, i32 261, i32 325, i32 389, i32 6, i32 70, i32 134, i32 198, i32 262, i32 326, i32 390, i32 7, i32 71, i32 135, i32 199, i32 263, i32 327, i32 391, i32 8, i32 72, i32 136, i32 200, i32 264, i32 328, i32 392, i32 9, i32 73, i32 137, i32 201, i32 265, i32 329, i32 393, i32 10, i32 74, i32 138, i32 202, i32 266, i32 330, i32 394, i32 11, i32 75, i32 139, i32 203, i32 267, i32 331, i32 395, i32 12, i32 76, i32 140, i32 204, i32 268, i32 332, i32 396, i32 13, i32 77, i32 141, i32 205, i32 269, i32 333, i32 397, i32 14, i32 78, i32 142, i32 206, i32 270, i32 334, i32 398, i32 15, i32 79, i32 143, i32 207, i32 271, i32 335, i32 399, i32 16, i32 80, i32 144, i32 208, i32 272, i32 336, i32 400, i32 17, i32 81, i32 145, i32 209, i32 273, i32 337, i32 401, i32 18, i32 82, i32 146, i32 210, i32 274, i32 338, i32 402, i32 19, i32 83, i32 147, i32 211, i32 275, i32 339, i32 403, i32 20, i32 84, i32 148, i32 212, i32 276, i32 340, i32 404, i32 21, i32 85, i32 149, i32 213, i32 277, i32 341, i32 405, i32 22, i32 86, i32 150, i32 214, i32 278, i32 342, i32 406, i32 23, i32 87, i32 151, i32 215, i32 279, i32 343, i32 407, i32 24, i32 88, i32 152, i32 216, i32 280, i32 344, i32 408, i32 25, i32 89, i32 153, i32 217, i32 281, i32 345, i32 409, i32 26, i32 90, i32 154, i32 218, i32 282, i32 346, i32 410, i32 27, i32 91, i32 155, i32 219, i32 283, i32 347, i32 411, i32 28, i32 92, i32 156, i32 220, i32 284, i32 348, i32 412, i32 29, i32 93, i32 157, i32 221, i32 285, i32 349, i32 413, i32 30, i32 94, i32 158, i32 222, i32 286, i32 350, i32 414, i32 31, i32 95, i32 159, i32 223, i32 287, i32 351, i32 415, i32 32, i32 96, i32 160, i32 224, i32 288, i32 352, i32 416, i32 33, i32 97, i32 161, i32 225, i32 289, i32 353, i32 417, i32 34, i32 98, i32 162, i32 226, i32 290, i32 354, i32 418, i32 35, i32 99, i32 163, i32 227, i32 291, i32 355, i32 419, i32 36, i32 100, i32 164, i32 228, i32 292, i32 356, i32 420, i32 37, i32 101, i32 165, i32 229, i32 293, i32 357, i32 421, i32 38, i32 102, i32 166, i32 230, i32 294, i32 358, i32 422, i32 39, i32 103, i32 167, i32 231, i32 295, i32 359, i32 423, i32 40, i32 104, i32 168, i32 232, i32 296, i32 360, i32 424, i32 41, i32 105, i32 169, i32 233, i32 297, i32 361, i32 425, i32 42, i32 106, i32 170, i32 234, i32 298, i32 362, i32 426, i32 43, i32 107, i32 171, i32 235, i32 299, i32 363, i32 427, i32 44, i32 108, i32 172, i32 236, i32 300, i32 364, i32 428, i32 45, i32 109, i32 173, i32 237, i32 301, i32 365, i32 429, i32 46, i32 110, i32 174, i32 238, i32 302, i32 366, i32 430, i32 47, i32 111, i32 175, i32 239, i32 303, i32 367, i32 431, i32 48, i32 112, i32 176, i32 240, i32 304, i32 368, i32 432, i32 49, i32 113, i32 177, i32 241, i32 305, i32 369, i32 433, i32 50, i32 114, i32 178, i32 242, i32 306, i32 370, i32 434, i32 51, i32 115, i32 179, i32 243, i32 307, i32 371, i32 435, i32 52, i32 116, i32 180, i32 244, i32 308, i32 372, i32 436, i32 53, i32 117, i32 181, i32 245, i32 309, i32 373, i32 437, i32 54, i32 118, i32 182, i32 246, i32 310, i32 374, i32 438, i32 55, i32 119, i32 183, i32 247, i32 311, i32 375, i32 439, i32 56, i32 120, i32 184, i32 248, i32 
312, i32 376, i32 440, i32 57, i32 121, i32 185, i32 249, i32 313, i32 377, i32 441, i32 58, i32 122, i32 186, i32 250, i32 314, i32 378, i32 442, i32 59, i32 123, i32 187, i32 251, i32 315, i32 379, i32 443, i32 60, i32 124, i32 188, i32 252, i32 316, i32 380, i32 444, i32 61, i32 125, i32 189, i32 253, i32 317, i32 381, i32 445, i32 62, i32 126, i32 190, i32 254, i32 318, i32 382, i32 446, i32 63, i32 127, i32 191, i32 255, i32 319, i32 383, i32 447>
16105 store <448 x i16> %interleaved.vec, ptr %out.vec, align 64