1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
16 ; These patterns are produced by LoopVectorizer for interleaved loads.
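;
; For illustration only (not checked by FileCheck): a scalar loop of the kind
; shown in the C sketch below, with hypothetical names and types, is what the
; LoopVectorizer turns into one wide load of the interleaved data followed by
; stride-7 shuffles, i.e. the IR patterns exercised in the functions below.
;
;   struct S7 { short f[7]; };                /* one group of 7 x i16 per element */
;   void deinterleave(struct S7 *in, short *out[7], int n) {
;     for (int i = 0; i < n; i++)             /* groups are consecutive in memory */
;       for (int j = 0; j < 7; j++)
;         out[j][i] = in[i].f[j];             /* out[j] collects every 7th i16    */
;   }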
18 define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
19 ; SSE-LABEL: load_i16_stride7_vf2:
20 ; SSE: # %bb.0:
21 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
22 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
23 ; SSE-NEXT: movdqa (%rdi), %xmm0
24 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
25 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
26 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
27 ; SSE-NEXT: movdqa %xmm0, %xmm3
28 ; SSE-NEXT: psrld $16, %xmm3
29 ; SSE-NEXT: movdqa %xmm3, %xmm4
30 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
31 ; SSE-NEXT: movdqa %xmm0, %xmm5
32 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
33 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,1,2,3]
34 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
35 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
36 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
37 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,2,3]
38 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
39 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
40 ; SSE-NEXT: psrlq $48, %xmm1
41 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
42 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
43 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
44 ; SSE-NEXT: movd %xmm2, (%rsi)
45 ; SSE-NEXT: movd %xmm4, (%rdx)
46 ; SSE-NEXT: movd %xmm6, (%rcx)
47 ; SSE-NEXT: movd %xmm5, (%r8)
48 ; SSE-NEXT: movd %xmm7, (%r9)
49 ; SSE-NEXT: movd %xmm3, (%r10)
50 ; SSE-NEXT: movd %xmm0, (%rax)
51 ; SSE-NEXT: retq
52 ;
53 ; AVX-LABEL: load_i16_stride7_vf2:
54 ; AVX: # %bb.0:
55 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
56 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
57 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
58 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
59 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
60 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
61 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm3
62 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
63 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
64 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,2,3]
65 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
66 ; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
67 ; AVX-NEXT: vpsrlq $48, %xmm1, %xmm7
68 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
69 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
70 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
71 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
72 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
73 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
74 ; AVX-NEXT: vmovd %xmm2, (%rsi)
75 ; AVX-NEXT: vmovd %xmm4, (%rdx)
76 ; AVX-NEXT: vmovd %xmm6, (%rcx)
77 ; AVX-NEXT: vpextrd $2, %xmm5, (%r8)
78 ; AVX-NEXT: vmovd %xmm7, (%r9)
79 ; AVX-NEXT: vmovd %xmm3, (%r10)
80 ; AVX-NEXT: vmovd %xmm0, (%rax)
81 ; AVX-NEXT: retq
82 ;
83 ; AVX2-LABEL: load_i16_stride7_vf2:
84 ; AVX2: # %bb.0:
85 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
86 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
87 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
88 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
89 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
90 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
91 ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm3
92 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
93 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
94 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,2,3]
95 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
96 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
97 ; AVX2-NEXT: vpbroadcastw 8(%rdi), %xmm7
98 ; AVX2-NEXT: vpsrlq $48, %xmm1, %xmm8
99 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
100 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
101 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
102 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
103 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
104 ; AVX2-NEXT: vmovd %xmm2, (%rsi)
105 ; AVX2-NEXT: vmovd %xmm4, (%rdx)
106 ; AVX2-NEXT: vmovd %xmm6, (%rcx)
107 ; AVX2-NEXT: vpextrd $2, %xmm5, (%r8)
108 ; AVX2-NEXT: vmovd %xmm7, (%r9)
109 ; AVX2-NEXT: vmovd %xmm3, (%r10)
110 ; AVX2-NEXT: vmovd %xmm0, (%rax)
111 ; AVX2-NEXT: retq
112 ;
113 ; AVX2-FP-LABEL: load_i16_stride7_vf2:
114 ; AVX2-FP: # %bb.0:
115 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
116 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
117 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
118 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
119 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
120 ; AVX2-FP-NEXT: vpsrld $16, %xmm0, %xmm3
121 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
122 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
123 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
124 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm7
125 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
126 ; AVX2-FP-NEXT: vpbroadcastw 8(%rdi), %xmm8
127 ; AVX2-FP-NEXT: vpsrlq $48, %xmm1, %xmm9
128 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
129 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
130 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
131 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
132 ; AVX2-FP-NEXT: vmovd %xmm2, (%rsi)
133 ; AVX2-FP-NEXT: vmovd %xmm4, (%rdx)
134 ; AVX2-FP-NEXT: vmovd %xmm7, (%rcx)
135 ; AVX2-FP-NEXT: vmovd %xmm5, (%r8)
136 ; AVX2-FP-NEXT: vmovd %xmm8, (%r9)
137 ; AVX2-FP-NEXT: vmovd %xmm3, (%r10)
138 ; AVX2-FP-NEXT: vmovd %xmm0, (%rax)
139 ; AVX2-FP-NEXT: retq
140 ;
141 ; AVX2-FCP-LABEL: load_i16_stride7_vf2:
142 ; AVX2-FCP: # %bb.0:
143 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
144 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
145 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
146 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
147 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
148 ; AVX2-FCP-NEXT: vpsrld $16, %xmm0, %xmm3
149 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
150 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
151 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
152 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm7
153 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
154 ; AVX2-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm8
155 ; AVX2-FCP-NEXT: vpsrlq $48, %xmm1, %xmm9
156 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
157 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
158 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
159 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
160 ; AVX2-FCP-NEXT: vmovd %xmm2, (%rsi)
161 ; AVX2-FCP-NEXT: vmovd %xmm4, (%rdx)
162 ; AVX2-FCP-NEXT: vmovd %xmm7, (%rcx)
163 ; AVX2-FCP-NEXT: vmovd %xmm5, (%r8)
164 ; AVX2-FCP-NEXT: vmovd %xmm8, (%r9)
165 ; AVX2-FCP-NEXT: vmovd %xmm3, (%r10)
166 ; AVX2-FCP-NEXT: vmovd %xmm0, (%rax)
167 ; AVX2-FCP-NEXT: retq
169 ; AVX512-LABEL: load_i16_stride7_vf2:
170 ; AVX512: # %bb.0:
171 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
172 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
173 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
174 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
175 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
176 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
177 ; AVX512-NEXT: vpsrld $16, %xmm0, %xmm3
178 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
179 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
180 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,2,3]
181 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
182 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
183 ; AVX512-NEXT: vpbroadcastw 8(%rdi), %xmm7
184 ; AVX512-NEXT: vpsrlq $48, %xmm1, %xmm8
185 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
186 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
187 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
188 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
189 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
190 ; AVX512-NEXT: vmovd %xmm2, (%rsi)
191 ; AVX512-NEXT: vmovd %xmm4, (%rdx)
192 ; AVX512-NEXT: vmovd %xmm6, (%rcx)
193 ; AVX512-NEXT: vpextrd $2, %xmm5, (%r8)
194 ; AVX512-NEXT: vmovd %xmm7, (%r9)
195 ; AVX512-NEXT: vmovd %xmm3, (%r10)
196 ; AVX512-NEXT: vmovd %xmm0, (%rax)
197 ; AVX512-NEXT: retq
198 ;
199 ; AVX512-FCP-LABEL: load_i16_stride7_vf2:
200 ; AVX512-FCP: # %bb.0:
201 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
202 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
203 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
204 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
205 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
206 ; AVX512-FCP-NEXT: vpsrld $16, %xmm0, %xmm3
207 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
208 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
209 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
210 ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm7
211 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
212 ; AVX512-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm8
213 ; AVX512-FCP-NEXT: vpsrlq $48, %xmm1, %xmm9
214 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
215 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
216 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
217 ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
218 ; AVX512-FCP-NEXT: vmovd %xmm2, (%rsi)
219 ; AVX512-FCP-NEXT: vmovd %xmm4, (%rdx)
220 ; AVX512-FCP-NEXT: vmovd %xmm7, (%rcx)
221 ; AVX512-FCP-NEXT: vmovd %xmm5, (%r8)
222 ; AVX512-FCP-NEXT: vmovd %xmm8, (%r9)
223 ; AVX512-FCP-NEXT: vmovd %xmm3, (%r10)
224 ; AVX512-FCP-NEXT: vmovd %xmm0, (%rax)
225 ; AVX512-FCP-NEXT: retq
227 ; AVX512DQ-LABEL: load_i16_stride7_vf2:
228 ; AVX512DQ: # %bb.0:
229 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
230 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
231 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
232 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
233 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
234 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
235 ; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm3
236 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
237 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
238 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,2,3]
239 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
240 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
241 ; AVX512DQ-NEXT: vpbroadcastw 8(%rdi), %xmm7
242 ; AVX512DQ-NEXT: vpsrlq $48, %xmm1, %xmm8
243 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
244 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
245 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
246 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
247 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
248 ; AVX512DQ-NEXT: vmovd %xmm2, (%rsi)
249 ; AVX512DQ-NEXT: vmovd %xmm4, (%rdx)
250 ; AVX512DQ-NEXT: vmovd %xmm6, (%rcx)
251 ; AVX512DQ-NEXT: vpextrd $2, %xmm5, (%r8)
252 ; AVX512DQ-NEXT: vmovd %xmm7, (%r9)
253 ; AVX512DQ-NEXT: vmovd %xmm3, (%r10)
254 ; AVX512DQ-NEXT: vmovd %xmm0, (%rax)
255 ; AVX512DQ-NEXT: retq
257 ; AVX512DQ-FCP-LABEL: load_i16_stride7_vf2:
258 ; AVX512DQ-FCP: # %bb.0:
259 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
260 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
261 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
262 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
263 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
264 ; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm0, %xmm3
265 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
266 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
267 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
268 ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm7
269 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
270 ; AVX512DQ-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm8
271 ; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm1, %xmm9
272 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
273 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
274 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
275 ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
276 ; AVX512DQ-FCP-NEXT: vmovd %xmm2, (%rsi)
277 ; AVX512DQ-FCP-NEXT: vmovd %xmm4, (%rdx)
278 ; AVX512DQ-FCP-NEXT: vmovd %xmm7, (%rcx)
279 ; AVX512DQ-FCP-NEXT: vmovd %xmm5, (%r8)
280 ; AVX512DQ-FCP-NEXT: vmovd %xmm8, (%r9)
281 ; AVX512DQ-FCP-NEXT: vmovd %xmm3, (%r10)
282 ; AVX512DQ-FCP-NEXT: vmovd %xmm0, (%rax)
283 ; AVX512DQ-FCP-NEXT: retq
285 ; AVX512BW-LABEL: load_i16_stride7_vf2:
286 ; AVX512BW: # %bb.0:
287 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
288 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
289 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
290 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
291 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
292 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
293 ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm3
294 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
295 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
296 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,2,3]
297 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
298 ; AVX512BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
299 ; AVX512BW-NEXT: vpbroadcastw 8(%rdi), %xmm7
300 ; AVX512BW-NEXT: vpsrlq $48, %xmm1, %xmm8
301 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
302 ; AVX512BW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
303 ; AVX512BW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
304 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
305 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
306 ; AVX512BW-NEXT: vmovd %xmm2, (%rsi)
307 ; AVX512BW-NEXT: vmovd %xmm4, (%rdx)
308 ; AVX512BW-NEXT: vmovd %xmm6, (%rcx)
309 ; AVX512BW-NEXT: vpextrd $2, %xmm5, (%r8)
310 ; AVX512BW-NEXT: vmovd %xmm7, (%r9)
311 ; AVX512BW-NEXT: vmovd %xmm3, (%r10)
312 ; AVX512BW-NEXT: vmovd %xmm0, (%rax)
313 ; AVX512BW-NEXT: retq
315 ; AVX512BW-FCP-LABEL: load_i16_stride7_vf2:
316 ; AVX512BW-FCP: # %bb.0:
317 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
318 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
319 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
320 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
321 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
322 ; AVX512BW-FCP-NEXT: vpsrld $16, %xmm0, %xmm3
323 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
324 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
325 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[8,9,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
326 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
327 ; AVX512BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm7
328 ; AVX512BW-FCP-NEXT: vpsrlq $48, %xmm1, %xmm8
329 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
330 ; AVX512BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
331 ; AVX512BW-FCP-NEXT: vmovd {{.*#+}} xmm8 = [6,13,0,0,0,0,0,0]
332 ; AVX512BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm8
333 ; AVX512BW-FCP-NEXT: vmovd %xmm2, (%rsi)
334 ; AVX512BW-FCP-NEXT: vmovd %xmm4, (%rdx)
335 ; AVX512BW-FCP-NEXT: vmovd %xmm6, (%rcx)
336 ; AVX512BW-FCP-NEXT: vmovd %xmm5, (%r8)
337 ; AVX512BW-FCP-NEXT: vmovd %xmm7, (%r9)
338 ; AVX512BW-FCP-NEXT: vmovd %xmm3, (%r10)
339 ; AVX512BW-FCP-NEXT: vmovd %xmm8, (%rax)
340 ; AVX512BW-FCP-NEXT: retq
342 ; AVX512DQ-BW-LABEL: load_i16_stride7_vf2:
343 ; AVX512DQ-BW: # %bb.0:
344 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
345 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
346 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
347 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1
348 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
349 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
350 ; AVX512DQ-BW-NEXT: vpsrld $16, %xmm0, %xmm3
351 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
352 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
353 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,2,3]
354 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
355 ; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
356 ; AVX512DQ-BW-NEXT: vpbroadcastw 8(%rdi), %xmm7
357 ; AVX512DQ-BW-NEXT: vpsrlq $48, %xmm1, %xmm8
358 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
359 ; AVX512DQ-BW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
360 ; AVX512DQ-BW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
361 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
362 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
363 ; AVX512DQ-BW-NEXT: vmovd %xmm2, (%rsi)
364 ; AVX512DQ-BW-NEXT: vmovd %xmm4, (%rdx)
365 ; AVX512DQ-BW-NEXT: vmovd %xmm6, (%rcx)
366 ; AVX512DQ-BW-NEXT: vpextrd $2, %xmm5, (%r8)
367 ; AVX512DQ-BW-NEXT: vmovd %xmm7, (%r9)
368 ; AVX512DQ-BW-NEXT: vmovd %xmm3, (%r10)
369 ; AVX512DQ-BW-NEXT: vmovd %xmm0, (%rax)
370 ; AVX512DQ-BW-NEXT: retq
372 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride7_vf2:
373 ; AVX512DQ-BW-FCP: # %bb.0:
374 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
375 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
376 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
377 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
378 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
379 ; AVX512DQ-BW-FCP-NEXT: vpsrld $16, %xmm0, %xmm3
380 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
381 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
382 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[8,9,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
383 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
384 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm7
385 ; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %xmm1, %xmm8
386 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
387 ; AVX512DQ-BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
388 ; AVX512DQ-BW-FCP-NEXT: vmovd {{.*#+}} xmm8 = [6,13,0,0,0,0,0,0]
389 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm8
390 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm2, (%rsi)
391 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm4, (%rdx)
392 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm6, (%rcx)
393 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm5, (%r8)
394 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm7, (%r9)
395 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm3, (%r10)
396 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm8, (%rax)
397 ; AVX512DQ-BW-FCP-NEXT: retq
398 %wide.vec = load <14 x i16>, ptr %in.vec, align 64
399 %strided.vec0 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> <i32 0, i32 7>
400 %strided.vec1 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> <i32 1, i32 8>
401 %strided.vec2 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> <i32 2, i32 9>
402 %strided.vec3 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> <i32 3, i32 10>
403 %strided.vec4 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> <i32 4, i32 11>
404 %strided.vec5 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> <i32 5, i32 12>
405 %strided.vec6 = shufflevector <14 x i16> %wide.vec, <14 x i16> poison, <2 x i32> <i32 6, i32 13>
406 store <2 x i16> %strided.vec0, ptr %out.vec0, align 64
407 store <2 x i16> %strided.vec1, ptr %out.vec1, align 64
408 store <2 x i16> %strided.vec2, ptr %out.vec2, align 64
409 store <2 x i16> %strided.vec3, ptr %out.vec3, align 64
410 store <2 x i16> %strided.vec4, ptr %out.vec4, align 64
411 store <2 x i16> %strided.vec5, ptr %out.vec5, align 64
412 store <2 x i16> %strided.vec6, ptr %out.vec6, align 64
413 ret void
414 }
416 define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
417 ; SSE-LABEL: load_i16_stride7_vf4:
418 ; SSE: # %bb.0:
419 ; SSE-NEXT: movdqa (%rdi), %xmm1
420 ; SSE-NEXT: movdqa 16(%rdi), %xmm4
421 ; SSE-NEXT: movdqa 32(%rdi), %xmm3
422 ; SSE-NEXT: movdqa 48(%rdi), %xmm6
423 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,2,3,3]
424 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,65535,65535,65535,65535]
425 ; SSE-NEXT: movdqa %xmm0, %xmm5
426 ; SSE-NEXT: pandn %xmm2, %xmm5
427 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,2,3,3]
428 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,3]
429 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
430 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
431 ; SSE-NEXT: pand %xmm0, %xmm2
432 ; SSE-NEXT: por %xmm5, %xmm2
433 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,65535,65535,65535,65535]
434 ; SSE-NEXT: movdqa %xmm4, %xmm7
435 ; SSE-NEXT: pand %xmm5, %xmm7
436 ; SSE-NEXT: pandn %xmm1, %xmm5
437 ; SSE-NEXT: por %xmm7, %xmm5
438 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
439 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm5[1,0,3,3,4,5,6,7]
440 ; SSE-NEXT: pand %xmm0, %xmm7
441 ; SSE-NEXT: movdqa %xmm3, %xmm5
442 ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
443 ; SSE-NEXT: pandn %xmm5, %xmm0
444 ; SSE-NEXT: por %xmm7, %xmm0
445 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,3,2,3]
446 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[0,3,2,3,4,5,6,7]
447 ; SSE-NEXT: movdqa %xmm1, %xmm8
448 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
449 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,1,2,3]
450 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
451 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
452 ; SSE-NEXT: movdqa %xmm3, %xmm10
453 ; SSE-NEXT: movdqa %xmm3, %xmm9
454 ; SSE-NEXT: psrlq $16, %xmm9
455 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
456 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
457 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[1,1,1,1]
458 ; SSE-NEXT: pslld $16, %xmm6
459 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
460 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,1,1,1]
461 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7]
462 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[2,2,2,2]
463 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
464 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3]
465 ; SSE-NEXT: movdqa %xmm1, %xmm10
466 ; SSE-NEXT: psrld $16, %xmm10
467 ; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
468 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
469 ; SSE-NEXT: psrlq $48, %xmm4
470 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
471 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
472 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi
473 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,1,1]
474 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
475 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
476 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
477 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm3[0],xmm10[1],xmm3[1]
478 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3]
479 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,1,1]
480 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
481 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
482 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
483 ; SSE-NEXT: movq %xmm2, (%rsi)
484 ; SSE-NEXT: movq %xmm0, (%rdx)
485 ; SSE-NEXT: movq %xmm7, (%rcx)
486 ; SSE-NEXT: movq %xmm8, (%r8)
487 ; SSE-NEXT: movq %xmm6, (%r9)
488 ; SSE-NEXT: movq %xmm10, (%rdi)
489 ; SSE-NEXT: movq %xmm1, (%rax)
490 ; SSE-NEXT: retq
491 ;
492 ; AVX-LABEL: load_i16_stride7_vf4:
493 ; AVX: # %bb.0:
494 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
495 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
496 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
497 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
498 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm4
499 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
500 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,3,3]
501 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3]
502 ; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
503 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
504 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,2,3,3]
505 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3],xmm3[4,5,6,7]
506 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
507 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
508 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,3,3,4,5,6,7]
509 ; AVX-NEXT: vpsrldq {{.*#+}} xmm6 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
510 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3],xmm5[4,5,6,7]
511 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,3,2,3]
512 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
513 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
514 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[2,1,2,3]
515 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7]
516 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
517 ; AVX-NEXT: vpslld $16, %xmm2, %xmm9
518 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
519 ; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,6,7]
520 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,2,2]
521 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6,7]
522 ; AVX-NEXT: vpsrlq $16, %xmm4, %xmm9
523 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
524 ; AVX-NEXT: vpsrlq $48, %xmm1, %xmm10
525 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3]
526 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
527 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7]
528 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm10
529 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
530 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
531 ; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
532 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2]
533 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3],xmm10[4,5,6,7]
534 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
535 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
536 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
537 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
538 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
539 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
540 ; AVX-NEXT: vmovq %xmm3, (%rsi)
541 ; AVX-NEXT: vmovq %xmm5, (%rdx)
542 ; AVX-NEXT: vmovq %xmm7, (%rcx)
543 ; AVX-NEXT: vmovq %xmm8, (%r8)
544 ; AVX-NEXT: vmovq %xmm9, (%r9)
545 ; AVX-NEXT: vmovq %xmm4, (%r10)
546 ; AVX-NEXT: vmovq %xmm0, (%rax)
547 ; AVX-NEXT: retq
548 ;
549 ; AVX2-LABEL: load_i16_stride7_vf4:
550 ; AVX2: # %bb.0:
551 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
552 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
553 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
554 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
555 ; AVX2-NEXT: vmovdqa (%rdi), %xmm3
556 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm4
557 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm5
558 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm5[2],xmm3[3]
559 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6],xmm1[7]
560 ; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
561 ; AVX2-NEXT: vmovdqa (%rdi), %xmm6
562 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3]
563 ; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3,4,5,6],xmm4[7]
564 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
565 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
566 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3]
567 ; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
568 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6,7]
569 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7]
570 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
571 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
572 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
573 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
574 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7
575 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
576 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
577 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
578 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
579 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
580 ; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8
581 ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
582 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
583 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
584 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
585 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
586 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
587 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,4,6,7]
588 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
589 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
590 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
591 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
592 ; AVX2-NEXT: vmovq %xmm1, (%rsi)
593 ; AVX2-NEXT: vmovq %xmm6, (%rdx)
594 ; AVX2-NEXT: vmovq %xmm3, (%rcx)
595 ; AVX2-NEXT: vmovq %xmm4, (%r8)
596 ; AVX2-NEXT: vmovq %xmm5, (%r9)
597 ; AVX2-NEXT: vmovq %xmm7, (%r10)
598 ; AVX2-NEXT: vmovq %xmm0, (%rax)
599 ; AVX2-NEXT: vzeroupper
600 ; AVX2-NEXT: retq
601 ;
602 ; AVX2-FP-LABEL: load_i16_stride7_vf4:
603 ; AVX2-FP: # %bb.0:
604 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
605 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
606 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
607 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1
608 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2
609 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm3
610 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm4
611 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],xmm4[2],xmm2[3]
612 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm3[6],xmm5[7]
613 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
614 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm6
615 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3]
616 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7]
617 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u]
618 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3]
619 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
620 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
621 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
622 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
623 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
624 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
625 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
626 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
627 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm4, %xmm8
628 ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm4
629 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
630 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
631 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
632 ; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9
633 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm9, %xmm7
634 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
635 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
636 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
637 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
638 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
639 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
640 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
641 ; AVX2-FP-NEXT: vmovq %xmm5, (%rsi)
642 ; AVX2-FP-NEXT: vmovq %xmm6, (%rdx)
643 ; AVX2-FP-NEXT: vmovq %xmm2, (%rcx)
644 ; AVX2-FP-NEXT: vmovq %xmm3, (%r8)
645 ; AVX2-FP-NEXT: vmovq %xmm4, (%r9)
646 ; AVX2-FP-NEXT: vmovq %xmm7, (%r10)
647 ; AVX2-FP-NEXT: vmovq %xmm0, (%rax)
648 ; AVX2-FP-NEXT: vzeroupper
649 ; AVX2-FP-NEXT: retq
650 ;
651 ; AVX2-FCP-LABEL: load_i16_stride7_vf4:
652 ; AVX2-FCP: # %bb.0:
653 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
654 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
655 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
656 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
657 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2
658 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
659 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm4
660 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1],xmm4[2],xmm2[3]
661 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm3[6],xmm5[7]
662 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
663 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm6
664 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3]
665 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3,4,5,6],xmm3[7]
666 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u]
667 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3]
668 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
669 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
670 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
671 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
672 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
673 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
674 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
675 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
676 ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm8
677 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
678 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
679 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
680 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
681 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
682 ; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7
683 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
684 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
685 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
686 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
687 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
688 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
689 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
690 ; AVX2-FCP-NEXT: vmovq %xmm5, (%rsi)
691 ; AVX2-FCP-NEXT: vmovq %xmm6, (%rdx)
692 ; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx)
693 ; AVX2-FCP-NEXT: vmovq %xmm3, (%r8)
694 ; AVX2-FCP-NEXT: vmovq %xmm4, (%r9)
695 ; AVX2-FCP-NEXT: vmovq %xmm7, (%r10)
696 ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax)
697 ; AVX2-FCP-NEXT: vzeroupper
698 ; AVX2-FCP-NEXT: retq
700 ; AVX512-LABEL: load_i16_stride7_vf4:
701 ; AVX512: # %bb.0:
702 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
703 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
704 ; AVX512-NEXT: vmovdqa (%rdi), %xmm2
705 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3
706 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm4
707 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3]
708 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7]
709 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
710 ; AVX512-NEXT: vmovdqa (%rdi), %xmm1
711 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
712 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7]
713 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
714 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
715 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3]
716 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
717 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
718 ; AVX512-NEXT: vmovdqa (%rdi), %ymm3
719 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm4
720 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
721 ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
722 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
723 ; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
724 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
725 ; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7
726 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
727 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
728 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
729 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
730 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
731 ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8
732 ; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
733 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
734 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
735 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
736 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
737 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
738 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,4,6,7]
739 ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3
740 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
741 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
742 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
743 ; AVX512-NEXT: vmovq %xmm0, (%rsi)
744 ; AVX512-NEXT: vmovq %xmm1, (%rdx)
745 ; AVX512-NEXT: vmovq %xmm2, (%rcx)
746 ; AVX512-NEXT: vmovq %xmm5, (%r8)
747 ; AVX512-NEXT: vmovq %xmm6, (%r9)
748 ; AVX512-NEXT: vmovq %xmm7, (%r10)
749 ; AVX512-NEXT: vmovq %xmm3, (%rax)
750 ; AVX512-NEXT: vzeroupper
751 ; AVX512-NEXT: retq
752 ;
753 ; AVX512-FCP-LABEL: load_i16_stride7_vf4:
754 ; AVX512-FCP: # %bb.0:
755 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
756 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
757 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1
758 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
759 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
760 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3]
761 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7]
762 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
763 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4
764 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
765 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6],xmm2[7]
766 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u]
767 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
768 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
769 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
770 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2
771 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
772 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
773 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
774 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
775 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
776 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
777 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
778 ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm8
779 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
780 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
781 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
782 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
783 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
784 ; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7
785 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
786 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
787 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
788 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
789 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
790 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
791 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
792 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rsi)
793 ; AVX512-FCP-NEXT: vmovq %xmm4, (%rdx)
794 ; AVX512-FCP-NEXT: vmovq %xmm1, (%rcx)
795 ; AVX512-FCP-NEXT: vmovq %xmm5, (%r8)
796 ; AVX512-FCP-NEXT: vmovq %xmm6, (%r9)
797 ; AVX512-FCP-NEXT: vmovq %xmm7, (%r10)
798 ; AVX512-FCP-NEXT: vmovq %xmm2, (%rax)
799 ; AVX512-FCP-NEXT: vzeroupper
800 ; AVX512-FCP-NEXT: retq
802 ; AVX512DQ-LABEL: load_i16_stride7_vf4:
803 ; AVX512DQ: # %bb.0:
804 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
805 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
806 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2
807 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3
808 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm4
809 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3]
810 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7]
811 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
812 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
813 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
814 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7]
815 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
816 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
817 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3]
818 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
819 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
820 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3
821 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm4
822 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
823 ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6
824 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
825 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
826 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
827 ; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7
828 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
829 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
830 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
831 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
832 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
833 ; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8
834 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
835 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
836 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
837 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
838 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
839 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
840 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,4,6,7]
841 ; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm3
842 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
843 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
844 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
845 ; AVX512DQ-NEXT: vmovq %xmm0, (%rsi)
846 ; AVX512DQ-NEXT: vmovq %xmm1, (%rdx)
847 ; AVX512DQ-NEXT: vmovq %xmm2, (%rcx)
848 ; AVX512DQ-NEXT: vmovq %xmm5, (%r8)
849 ; AVX512DQ-NEXT: vmovq %xmm6, (%r9)
850 ; AVX512DQ-NEXT: vmovq %xmm7, (%r10)
851 ; AVX512DQ-NEXT: vmovq %xmm3, (%rax)
852 ; AVX512DQ-NEXT: vzeroupper
853 ; AVX512DQ-NEXT: retq
855 ; AVX512DQ-FCP-LABEL: load_i16_stride7_vf4:
856 ; AVX512DQ-FCP: # %bb.0:
857 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
858 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
859 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1
860 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
861 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
862 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3]
863 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7]
864 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
865 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4
866 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
867 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6],xmm2[7]
868 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,0,1,14,15,12,13,u,u,u,u,u,u,u,u]
869 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
870 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
871 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
872 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2
873 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
874 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
875 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
876 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
877 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
878 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
879 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
880 ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm8
881 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
882 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
883 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
884 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
885 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
886 ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7
887 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
888 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
889 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
890 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
891 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
892 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
893 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
894 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rsi)
895 ; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rdx)
896 ; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rcx)
897 ; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8)
898 ; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r9)
899 ; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r10)
900 ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rax)
901 ; AVX512DQ-FCP-NEXT: vzeroupper
902 ; AVX512DQ-FCP-NEXT: retq
904 ; AVX512BW-LABEL: load_i16_stride7_vf4:
905 ; AVX512BW: # %bb.0:
906 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
907 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
908 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0]
909 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
910 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
911 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
912 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,8,15,22,0,0,0,0]
913 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
914 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,9,16,23,0,0,0,0]
915 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
916 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,10,17,24,0,0,0,0]
917 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
918 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,11,18,25,0,0,0,0]
919 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
920 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [5,12,19,26,0,0,0,0]
921 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
922 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [6,13,20,27,0,0,0,0]
923 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8
924 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
925 ; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
926 ; AVX512BW-NEXT: vmovq %xmm4, (%rcx)
927 ; AVX512BW-NEXT: vmovq %xmm5, (%r8)
928 ; AVX512BW-NEXT: vmovq %xmm6, (%r9)
929 ; AVX512BW-NEXT: vmovq %xmm7, (%r10)
930 ; AVX512BW-NEXT: vmovq %xmm8, (%rax)
931 ; AVX512BW-NEXT: vzeroupper
932 ; AVX512BW-NEXT: retq
934 ; AVX512BW-FCP-LABEL: load_i16_stride7_vf4:
935 ; AVX512BW-FCP: # %bb.0:
936 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
937 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
938 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0]
939 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
940 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
941 ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
942 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,8,15,22,0,0,0,0]
943 ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
944 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,9,16,23,0,0,0,0]
945 ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
946 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,10,17,24,0,0,0,0]
947 ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
948 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,11,18,25,0,0,0,0]
949 ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
950 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,12,19,26,0,0,0,0]
951 ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
952 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [6,13,20,27,0,0,0,0]
953 ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm8
954 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi)
955 ; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
956 ; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx)
957 ; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8)
958 ; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r9)
959 ; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10)
960 ; AVX512BW-FCP-NEXT: vmovq %xmm8, (%rax)
961 ; AVX512BW-FCP-NEXT: vzeroupper
962 ; AVX512BW-FCP-NEXT: retq
964 ; AVX512DQ-BW-LABEL: load_i16_stride7_vf4:
965 ; AVX512DQ-BW: # %bb.0:
966 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
967 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
968 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0]
969 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
970 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2
971 ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
972 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [1,8,15,22,0,0,0,0]
973 ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
974 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [2,9,16,23,0,0,0,0]
975 ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
976 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [3,10,17,24,0,0,0,0]
977 ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
978 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [4,11,18,25,0,0,0,0]
979 ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
980 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [5,12,19,26,0,0,0,0]
981 ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
982 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm8 = [6,13,20,27,0,0,0,0]
983 ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8
984 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi)
985 ; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
986 ; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx)
987 ; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r8)
988 ; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r9)
989 ; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10)
990 ; AVX512DQ-BW-NEXT: vmovq %xmm8, (%rax)
991 ; AVX512DQ-BW-NEXT: vzeroupper
992 ; AVX512DQ-BW-NEXT: retq
993 ;
994 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride7_vf4:
995 ; AVX512DQ-BW-FCP: # %bb.0:
996 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
997 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
998 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0]
999 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
1000 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
1001 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
1002 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,8,15,22,0,0,0,0]
1003 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
1004 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,9,16,23,0,0,0,0]
1005 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
1006 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,10,17,24,0,0,0,0]
1007 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
1008 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,11,18,25,0,0,0,0]
1009 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
1010 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,12,19,26,0,0,0,0]
1011 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
1012 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [6,13,20,27,0,0,0,0]
1013 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm8
1014 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi)
1015 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
1016 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx)
1017 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8)
1018 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r9)
1019 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10)
1020 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm8, (%rax)
1021 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
1022 ; AVX512DQ-BW-FCP-NEXT: retq
1023 %wide.vec = load <28 x i16>, ptr %in.vec, align 64
1024 %strided.vec0 = shufflevector <28 x i16> %wide.vec, <28 x i16> poison, <4 x i32> <i32 0, i32 7, i32 14, i32 21>
1025 %strided.vec1 = shufflevector <28 x i16> %wide.vec, <28 x i16> poison, <4 x i32> <i32 1, i32 8, i32 15, i32 22>
1026 %strided.vec2 = shufflevector <28 x i16> %wide.vec, <28 x i16> poison, <4 x i32> <i32 2, i32 9, i32 16, i32 23>
1027 %strided.vec3 = shufflevector <28 x i16> %wide.vec, <28 x i16> poison, <4 x i32> <i32 3, i32 10, i32 17, i32 24>
1028 %strided.vec4 = shufflevector <28 x i16> %wide.vec, <28 x i16> poison, <4 x i32> <i32 4, i32 11, i32 18, i32 25>
1029 %strided.vec5 = shufflevector <28 x i16> %wide.vec, <28 x i16> poison, <4 x i32> <i32 5, i32 12, i32 19, i32 26>
1030 %strided.vec6 = shufflevector <28 x i16> %wide.vec, <28 x i16> poison, <4 x i32> <i32 6, i32 13, i32 20, i32 27>
1031 store <4 x i16> %strided.vec0, ptr %out.vec0, align 64
1032 store <4 x i16> %strided.vec1, ptr %out.vec1, align 64
1033 store <4 x i16> %strided.vec2, ptr %out.vec2, align 64
1034 store <4 x i16> %strided.vec3, ptr %out.vec3, align 64
1035 store <4 x i16> %strided.vec4, ptr %out.vec4, align 64
1036 store <4 x i16> %strided.vec5, ptr %out.vec5, align 64
1037 store <4 x i16> %strided.vec6, ptr %out.vec6, align 64
1038 ret void
1039 }
1041 define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
1042 ; SSE-LABEL: load_i16_stride7_vf8:
1043 ; SSE: # %bb.0:
1044 ; SSE-NEXT: movdqa (%rdi), %xmm0
1045 ; SSE-NEXT: movdqa 16(%rdi), %xmm9
1046 ; SSE-NEXT: movaps 32(%rdi), %xmm2
1047 ; SSE-NEXT: movaps 48(%rdi), %xmm8
1048 ; SSE-NEXT: movdqa 80(%rdi), %xmm7
1049 ; SSE-NEXT: movdqa 64(%rdi), %xmm1
1050 ; SSE-NEXT: movdqa 96(%rdi), %xmm6
1051 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,0,0]
1052 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,65535,65535,0]
1053 ; SSE-NEXT: movdqa %xmm11, %xmm4
1054 ; SSE-NEXT: pandn %xmm3, %xmm4
1055 ; SSE-NEXT: movdqa %xmm1, %xmm5
1056 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
1057 ; SSE-NEXT: pand %xmm11, %xmm5
1058 ; SSE-NEXT: por %xmm4, %xmm5
1059 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0]
1060 ; SSE-NEXT: movdqa %xmm3, %xmm10
1061 ; SSE-NEXT: pandn %xmm5, %xmm10
1062 ; SSE-NEXT: movaps %xmm2, %xmm5
1063 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,2],xmm8[2,2]
1064 ; SSE-NEXT: movaps {{.*#+}} xmm4 = [65535,65535,65535,0,0,65535,65535,65535]
1065 ; SSE-NEXT: movaps %xmm4, %xmm12
1066 ; SSE-NEXT: andnps %xmm5, %xmm12
1067 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm9[2,2,3,3]
1068 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,3]
1069 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
1070 ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm13[2],xmm5[3],xmm13[3]
1071 ; SSE-NEXT: pand %xmm4, %xmm5
1072 ; SSE-NEXT: por %xmm12, %xmm5
1073 ; SSE-NEXT: pand %xmm3, %xmm5
1074 ; SSE-NEXT: por %xmm10, %xmm5
1075 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,1,0,1]
1076 ; SSE-NEXT: movdqa %xmm6, %xmm10
1077 ; SSE-NEXT: pslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm10[0,1,2,3,4,5]
1078 ; SSE-NEXT: movdqa %xmm11, %xmm12
1079 ; SSE-NEXT: pandn %xmm10, %xmm12
1080 ; SSE-NEXT: movdqa %xmm7, %xmm10
1081 ; SSE-NEXT: psrld $16, %xmm10
1082 ; SSE-NEXT: movdqa %xmm1, %xmm15
1083 ; SSE-NEXT: psrldq {{.*#+}} xmm15 = xmm15[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1084 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm10[0],xmm15[1],xmm10[1]
1085 ; SSE-NEXT: pand %xmm11, %xmm15
1086 ; SSE-NEXT: por %xmm12, %xmm15
1087 ; SSE-NEXT: movdqa %xmm3, %xmm13
1088 ; SSE-NEXT: pandn %xmm15, %xmm13
1089 ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,65535,65535,65535,65535]
1090 ; SSE-NEXT: movdqa %xmm10, %xmm12
1091 ; SSE-NEXT: pandn %xmm0, %xmm12
1092 ; SSE-NEXT: movdqa %xmm9, %xmm15
1093 ; SSE-NEXT: pand %xmm10, %xmm15
1094 ; SSE-NEXT: por %xmm12, %xmm15
1095 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm15[0,3,2,3]
1096 ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[1,0,3,3,4,5,6,7]
1097 ; SSE-NEXT: pand %xmm4, %xmm12
1098 ; SSE-NEXT: movaps %xmm2, %xmm15
1099 ; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
1100 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[2,1,2,1]
1101 ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,0,0,0,4,5,6,7]
1102 ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,7,7,7]
1103 ; SSE-NEXT: pandn %xmm15, %xmm4
1104 ; SSE-NEXT: movdqa %xmm1, %xmm15
1105 ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm7[2],xmm15[3],xmm7[3]
1106 ; SSE-NEXT: por %xmm12, %xmm4
1107 ; SSE-NEXT: movdqa %xmm0, %xmm12
1108 ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
1109 ; SSE-NEXT: pand %xmm3, %xmm4
1110 ; SSE-NEXT: por %xmm13, %xmm4
1111 ; SSE-NEXT: movdqa %xmm1, %xmm13
1112 ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm7[4],xmm13[5],xmm7[5],xmm13[6],xmm7[6],xmm13[7],xmm7[7]
1113 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
1114 ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,7,7]
1115 ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7]
1116 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,0,1]
1117 ; SSE-NEXT: pand %xmm11, %xmm15
1118 ; SSE-NEXT: pandn %xmm14, %xmm11
1119 ; SSE-NEXT: por %xmm15, %xmm11
1120 ; SSE-NEXT: movdqa %xmm3, %xmm14
1121 ; SSE-NEXT: pandn %xmm11, %xmm14
1122 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm8[2,2,3,3]
1123 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[0,1,0,3]
1124 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7]
1125 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm15[1]
1126 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[2,1,2,3]
1127 ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7]
1128 ; SSE-NEXT: movss {{.*#+}} xmm11 = xmm15[0],xmm11[1,2,3]
1129 ; SSE-NEXT: andps %xmm3, %xmm11
1130 ; SSE-NEXT: orps %xmm14, %xmm11
1131 ; SSE-NEXT: movdqa %xmm10, %xmm14
1132 ; SSE-NEXT: pandn %xmm2, %xmm14
1133 ; SSE-NEXT: movdqa %xmm8, %xmm15
1134 ; SSE-NEXT: pand %xmm10, %xmm15
1135 ; SSE-NEXT: por %xmm14, %xmm15
1136 ; SSE-NEXT: movdqa %xmm0, %xmm14
1137 ; SSE-NEXT: psrld $16, %xmm14
1138 ; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7]
1139 ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm15[0,1,1,0,4,5,6,7]
1140 ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,7,7,7]
1141 ; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,6,7]
1142 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[2,2,2,2]
1143 ; SSE-NEXT: movss {{.*#+}} xmm15 = xmm12[0],xmm15[1,2,3]
1144 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[2,3,2,3]
1145 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
1146 ; SSE-NEXT: andps %xmm3, %xmm15
1147 ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7]
1148 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,0]
1149 ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,4,7]
1150 ; SSE-NEXT: pandn %xmm13, %xmm3
1151 ; SSE-NEXT: movdqa %xmm2, %xmm13
1152 ; SSE-NEXT: psrlq $16, %xmm13
1153 ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
1154 ; SSE-NEXT: por %xmm15, %xmm3
1155 ; SSE-NEXT: movdqa %xmm7, %xmm15
1156 ; SSE-NEXT: pand %xmm10, %xmm15
1157 ; SSE-NEXT: pandn %xmm1, %xmm10
1158 ; SSE-NEXT: por %xmm15, %xmm10
1159 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,1,0,3]
1160 ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7]
1161 ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm7[2],xmm15[3],xmm7[3]
1162 ; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7]
1163 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3]
1164 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,4,7,7]
1165 ; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
1166 ; SSE-NEXT: psrlq $48, %xmm9
1167 ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
1168 ; SSE-NEXT: movdqa %xmm2, %xmm9
1169 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
1170 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,1,1]
1171 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7]
1172 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
1173 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[1,1,1,1]
1174 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1]
1175 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1176 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi
1177 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm15[3,1,2,0]
1178 ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,6,7]
1179 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
1180 ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,4,7]
1181 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm12[0],xmm13[1]
1182 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7]
1183 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7]
1184 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[2,3,2,3]
1185 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1]
1186 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm10[0,2]
1187 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
1188 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
1189 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
1190 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
1191 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
1192 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1193 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3]
1194 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
1195 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,4,7]
1196 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
1197 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
1198 ; SSE-NEXT: movdqa %xmm5, (%rsi)
1199 ; SSE-NEXT: movdqa %xmm4, (%rdx)
1200 ; SSE-NEXT: movaps %xmm11, (%rcx)
1201 ; SSE-NEXT: movdqa %xmm3, (%r8)
1202 ; SSE-NEXT: movapd %xmm13, (%r9)
1203 ; SSE-NEXT: movaps %xmm14, (%rdi)
1204 ; SSE-NEXT: movapd %xmm1, (%rax)
1205 ; SSE-NEXT: retq
1206 ;
1207 ; AVX-LABEL: load_i16_stride7_vf8:
1208 ; AVX: # %bb.0:
1209 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
1210 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
1211 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm2
1212 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm3
1213 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
1214 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm4
1215 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,0,0,0]
1216 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,5,6],xmm1[7]
1217 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
1218 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
1219 ; AVX-NEXT: vmovaps 32(%rdi), %xmm5
1220 ; AVX-NEXT: vmovaps 48(%rdi), %xmm7
1221 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,2,3,3]
1222 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,1,0,3]
1223 ; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7]
1224 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
1225 ; AVX-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm5[2],xmm7[2],zero
1226 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7]
1227 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3,4],xmm6[5,6,7]
1228 ; AVX-NEXT: vpsrld $16, %xmm2, %xmm8
1229 ; AVX-NEXT: vpsrldq {{.*#+}} xmm9 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
1230 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
1231 ; AVX-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5]
1232 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6],xmm9[7]
1233 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
1234 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,8,9,6,7,u,u,u,u,u,u]
1235 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
1236 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,3]
1237 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,0,3,3,4,5,6,7]
1238 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3,4],xmm10[5,6,7]
1239 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3,4],xmm8[5,6,7]
1240 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,2,2,2]
1241 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,5],xmm9[6,7]
1242 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,0,1]
1243 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm10[7]
1244 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[2,2,3,3]
1245 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,1,0,3]
1246 ; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,4,7]
1247 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm11[1],xmm10[1]
1248 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1249 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[2,1,2,3]
1250 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,3,2,3,4,5,6,7]
1251 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3,4,5,6,7]
1252 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
1253 ; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,6,5,6,7]
1254 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,2,2,2]
1255 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm5[1],xmm7[2,3,4,5,6,7]
1256 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,0,4,5,6,7]
1257 ; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,7,7,7]
1258 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3,4,5,6,7]
1259 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
1260 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1]
1261 ; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,7,7]
1262 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
1263 ; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,4,5,8,9,2,3]
1264 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm11[5,6,7]
1265 ; AVX-NEXT: vpsrlq $16, %xmm5, %xmm11
1266 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3]
1267 ; AVX-NEXT: vpsrlq $48, %xmm1, %xmm12
1268 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3]
1269 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
1270 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3],xmm12[4,5,6,7]
1271 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[0,1,0,3]
1272 ; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7]
1273 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm12[2],xmm2[2],xmm12[3],xmm2[3]
1274 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
1275 ; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,u,u,0,1,4,5,8,9,6,7]
1276 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4,5,6,7]
1277 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm12
1278 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm1[4],xmm12[5],xmm1[5],xmm12[6],xmm1[6],xmm12[7],xmm1[7]
1279 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
1280 ; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,6,5,6,7]
1281 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,2,2,2]
1282 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7]
1283 ; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
1284 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,0,3]
1285 ; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,4,7,7]
1286 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
1287 ; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7]
1288 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,0,2]
1289 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4,5,6,7]
1290 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
1291 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1292 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3]
1293 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
1294 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
1295 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,1,1]
1296 ; AVX-NEXT: vpsrldq {{.*#+}} xmm4 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
1297 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
1298 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1299 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
1300 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
1301 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7]
1302 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1303 ; AVX-NEXT: vmovdqa %xmm6, (%rsi)
1304 ; AVX-NEXT: vmovdqa %xmm8, (%rdx)
1305 ; AVX-NEXT: vmovdqa %xmm9, (%rcx)
1306 ; AVX-NEXT: vmovdqa %xmm10, (%r8)
1307 ; AVX-NEXT: vmovdqa %xmm11, (%r9)
1308 ; AVX-NEXT: vmovdqa %xmm12, (%r10)
1309 ; AVX-NEXT: vmovdqa %xmm0, (%rax)
1310 ; AVX-NEXT: retq
1311 ;
1312 ; AVX2-LABEL: load_i16_stride7_vf8:
1313 ; AVX2: # %bb.0:
1314 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
1315 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
1316 ; AVX2-NEXT: vmovdqa (%rdi), %ymm3
1317 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4
1318 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm0
1319 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm1
1320 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0],xmm1[1,2,3]
1321 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2
1322 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
1323 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5]
1324 ; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm5
1325 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
1326 ; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8
1327 ; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7]
1328 ; AVX2-NEXT: vpshufb %xmm6, %xmm7, %xmm6
1329 ; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
1330 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm7
1331 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm8
1332 ; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0,1],xmm8[2,3]
1333 ; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7]
1334 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
1335 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6]
1336 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
1337 ; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm11
1338 ; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7]
1339 ; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u]
1340 ; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7]
1341 ; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm2[4],xmm9[5,6,7]
1342 ; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
1343 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7]
1344 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
1345 ; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm11
1346 ; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7]
1347 ; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u]
1348 ; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
1349 ; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3]
1350 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
1351 ; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1]
1352 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
1353 ; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm13
1354 ; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2],xmm12[3],xmm13[4,5,6,7]
1355 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,2,1,0,4,5,6,7]
1356 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7]
1357 ; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7]
1358 ; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,7,0,1,14,15,u,u,10,11]
1359 ; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7]
1360 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
1361 ; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm13
1362 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7]
1363 ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
1364 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7]
1365 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
1366 ; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
1367 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
1368 ; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm13
1369 ; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
1370 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7]
1371 ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
1372 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
1373 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
1374 ; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
1375 ; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm7[1,2,3,4,5,6],xmm2[7]
1376 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
1377 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,4,7,6]
1378 ; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
1379 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
1380 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,6,4,6,7]
1381 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3
1382 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
1383 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
1384 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
1385 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1386 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1387 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15]
1388 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
1389 ; AVX2-NEXT: vmovdqa %xmm5, (%rsi)
1390 ; AVX2-NEXT: vmovdqa %xmm6, (%rdx)
1391 ; AVX2-NEXT: vmovdqa %xmm9, (%rcx)
1392 ; AVX2-NEXT: vmovdqa %xmm10, (%r8)
1393 ; AVX2-NEXT: vmovdqa %xmm11, (%r9)
1394 ; AVX2-NEXT: vmovdqa %xmm7, (%r10)
1395 ; AVX2-NEXT: vmovdqa %xmm0, (%rax)
1396 ; AVX2-NEXT: vzeroupper
1397 ; AVX2-NEXT: retq
1398 ;
1399 ; AVX2-FP-LABEL: load_i16_stride7_vf8:
1400 ; AVX2-FP: # %bb.0:
1401 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1402 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1403 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3
1404 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm4
1405 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm0
1406 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm1
1407 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0],xmm1[1,2,3]
1408 ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm2
1409 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
1410 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5]
1411 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm5
1412 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
1413 ; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8
1414 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7]
1415 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm7, %xmm6
1416 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
1417 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm7
1418 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm9
1419 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm9[2,3]
1420 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm2[3],xmm8[4,5,6,7]
1421 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
1422 ; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm11
1423 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7]
1424 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3,4],xmm10[5,6,7]
1425 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
1426 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6,7]
1427 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
1428 ; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm11
1429 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7]
1430 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3,4,5],xmm10[6,7]
1431 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
1432 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3]
1433 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
1434 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,4,5,2,3,0,1,14,15,8,9,6,7,0,1]
1435 ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm10, %xmm10
1436 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
1437 ; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm14
1438 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3],xmm14[4,5,6,7]
1439 ; AVX2-FP-NEXT: vpshufb %xmm12, %xmm13, %xmm12
1440 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7]
1441 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,7,0,1,14,15,u,u,10,11]
1442 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7]
1443 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
1444 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
1445 ; AVX2-FP-NEXT: vpshufb %xmm13, %xmm12, %xmm14
1446 ; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm12
1447 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7]
1448 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
1449 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
1450 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
1451 ; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm14
1452 ; AVX2-FP-NEXT: vpshufb %xmm13, %xmm14, %xmm13
1453 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
1454 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
1455 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
1456 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm7[1,2,3,4,5,6],xmm2[7]
1457 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
1458 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
1459 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
1460 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
1461 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
1462 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7]
1463 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1464 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1465 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1466 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15]
1467 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
1468 ; AVX2-FP-NEXT: vmovdqa %xmm5, (%rsi)
1469 ; AVX2-FP-NEXT: vmovdqa %xmm6, (%rdx)
1470 ; AVX2-FP-NEXT: vmovdqa %xmm8, (%rcx)
1471 ; AVX2-FP-NEXT: vmovdqa %xmm10, (%r8)
1472 ; AVX2-FP-NEXT: vmovdqa %xmm11, (%r9)
1473 ; AVX2-FP-NEXT: vmovdqa %xmm7, (%r10)
1474 ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rax)
1475 ; AVX2-FP-NEXT: vzeroupper
1476 ; AVX2-FP-NEXT: retq
1477 ;
1478 ; AVX2-FCP-LABEL: load_i16_stride7_vf8:
1479 ; AVX2-FCP: # %bb.0:
1480 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1481 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1482 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3
1483 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm4
1484 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm0
1485 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
1486 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0],xmm1[1,2,3]
1487 ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm2
1488 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
1489 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5]
1490 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5
1491 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
1492 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
1493 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7]
1494 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6
1495 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
1496 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm7
1497 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm9
1498 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm9[2,3]
1499 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm2[3],xmm8[4,5,6,7]
1500 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
1501 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11
1502 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7]
1503 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3,4],xmm10[5,6,7]
1504 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
1505 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6,7]
1506 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
1507 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11
1508 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7]
1509 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3,4,5],xmm10[6,7]
1510 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
1511 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3]
1512 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
1513 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,4,5,2,3,0,1,14,15,8,9,6,7,0,1]
1514 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm10
1515 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
1516 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14
1517 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3],xmm14[4,5,6,7]
1518 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm13, %xmm12
1519 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3,4],xmm10[5,6,7]
1520 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,6,7,0,1,14,15,u,u,10,11]
1521 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm2[6],xmm11[7]
1522 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
1523 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
1524 ; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm12, %xmm14
1525 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
1526 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7]
1527 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
1528 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
1529 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
1530 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14
1531 ; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm14, %xmm13
1532 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
1533 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
1534 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
1535 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm7[1,2,3,4,5,6],xmm2[7]
1536 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
1537 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
1538 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
1539 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
1540 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
1541 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7]
1542 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1543 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1544 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1545 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15]
1546 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
1547 ; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rsi)
1548 ; AVX2-FCP-NEXT: vmovdqa %xmm6, (%rdx)
1549 ; AVX2-FCP-NEXT: vmovdqa %xmm8, (%rcx)
1550 ; AVX2-FCP-NEXT: vmovdqa %xmm10, (%r8)
1551 ; AVX2-FCP-NEXT: vmovdqa %xmm11, (%r9)
1552 ; AVX2-FCP-NEXT: vmovdqa %xmm7, (%r10)
1553 ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rax)
1554 ; AVX2-FCP-NEXT: vzeroupper
1555 ; AVX2-FCP-NEXT: retq
1556 ;
1557 ; AVX512-LABEL: load_i16_stride7_vf8:
1558 ; AVX512: # %bb.0:
1559 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
1560 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
1561 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm0
1562 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm1
1563 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1,2,3]
1564 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm2
1565 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1566 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5]
1567 ; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3
1568 ; AVX512-NEXT: vmovdqa (%rdi), %ymm4
1569 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5
1570 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
1571 ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8
1572 ; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7]
1573 ; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm6
1574 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7]
1575 ; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3]
1576 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7]
1577 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
1578 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6]
1579 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
1580 ; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9
1581 ; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7]
1582 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u]
1583 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3,4],xmm6[5,6,7]
1584 ; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7]
1585 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
1586 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
1587 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
1588 ; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9
1589 ; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
1590 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u]
1591 ; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4],xmm7[5,6,7]
1592 ; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
1593 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
1594 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1]
1595 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
1596 ; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11
1597 ; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
1598 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,2,1,0,4,5,6,7]
1599 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7]
1600 ; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7]
1601 ; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11]
1602 ; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7]
1603 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
1604 ; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11
1605 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
1606 ; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
1607 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7]
1608 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
1609 ; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
1610 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
1611 ; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11
1612 ; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
1613 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7]
1614 ; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
1615 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
1616 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
1617 ; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2,3]
1618 ; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm11[1,2,3,4,5,6],xmm2[7]
1619 ; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,0,3]
1620 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,7,6]
1621 ; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3]
1622 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
1623 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,4,6,7]
1624 ; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4
1625 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
1626 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
1627 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
1628 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1629 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1630 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15]
1631 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
1632 ; AVX512-NEXT: vmovdqa %xmm3, (%rsi)
1633 ; AVX512-NEXT: vmovdqa %xmm6, (%rdx)
1634 ; AVX512-NEXT: vmovdqa %xmm7, (%rcx)
1635 ; AVX512-NEXT: vmovdqa %xmm8, (%r8)
1636 ; AVX512-NEXT: vmovdqa %xmm9, (%r9)
1637 ; AVX512-NEXT: vmovdqa %xmm10, (%r10)
1638 ; AVX512-NEXT: vmovdqa %xmm0, (%rax)
1639 ; AVX512-NEXT: vzeroupper
1640 ; AVX512-NEXT: retq
1641 ;
1642 ; AVX512-FCP-LABEL: load_i16_stride7_vf8:
1643 ; AVX512-FCP: # %bb.0:
1644 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1645 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1646 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm0
1647 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
1648 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1,2,3]
1649 ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm2
1650 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1651 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5]
1652 ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
1653 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4
1654 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
1655 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
1656 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
1657 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7]
1658 ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6
1659 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7]
1660 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3]
1661 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7]
1662 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
1663 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
1664 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7]
1665 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3,4],xmm8[5,6,7]
1666 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
1667 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7]
1668 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
1669 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
1670 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
1671 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7]
1672 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
1673 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
1674 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
1675 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,4,5,2,3,0,1,14,15,8,9,6,7,0,1]
1676 ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm8
1677 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
1678 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
1679 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7]
1680 ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm10
1681 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7]
1682 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11]
1683 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7]
1684 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
1685 ; AVX512-FCP-NEXT: vmovd {{.*#+}} xmm11 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
1686 ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm12
1687 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10
1688 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7]
1689 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
1690 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
1691 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
1692 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12
1693 ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm11
1694 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
1695 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
1696 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2,3]
1697 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm11[1,2,3,4,5,6],xmm2[7]
1698 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
1699 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3]
1700 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
1701 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
1702 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
1703 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
1704 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
1705 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1706 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1707 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15]
1708 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
1709 ; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rsi)
1710 ; AVX512-FCP-NEXT: vmovdqa %xmm6, (%rdx)
1711 ; AVX512-FCP-NEXT: vmovdqa %xmm7, (%rcx)
1712 ; AVX512-FCP-NEXT: vmovdqa %xmm8, (%r8)
1713 ; AVX512-FCP-NEXT: vmovdqa %xmm9, (%r9)
1714 ; AVX512-FCP-NEXT: vmovdqa %xmm10, (%r10)
1715 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rax)
1716 ; AVX512-FCP-NEXT: vzeroupper
1717 ; AVX512-FCP-NEXT: retq
1718 ;
1719 ; AVX512DQ-LABEL: load_i16_stride7_vf8:
1720 ; AVX512DQ: # %bb.0:
1721 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
1722 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
1723 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm0
1724 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm1
1725 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1,2,3]
1726 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm2
1727 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1728 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5]
1729 ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm3
1730 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm4
1731 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5
1732 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
1733 ; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8
1734 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7]
1735 ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm7, %xmm6
1736 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7]
1737 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3]
1738 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7]
1739 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
1740 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6]
1741 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
1742 ; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9
1743 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7]
1744 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,u,u,u,u,u,u]
1745 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2,3,4],xmm6[5,6,7]
1746 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7]
1747 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
1748 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
1749 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
1750 ; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9
1751 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
1752 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,u,u,u,u,u,u]
1753 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3,4],xmm7[5,6,7]
1754 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
1755 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
1756 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,8,9,6,7,0,1]
1757 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
1758 ; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11
1759 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
1760 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,2,1,0,4,5,6,7]
1761 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7]
1762 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7]
1763 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11]
1764 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7]
1765 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
1766 ; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11
1767 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
1768 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
1769 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7]
1770 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
1771 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
1772 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
1773 ; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11
1774 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
1775 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7]
1776 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
1777 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
1778 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
1779 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2,3]
1780 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm11[1,2,3,4,5,6],xmm2[7]
1781 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,0,3]
1782 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,4,7,6]
1783 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3]
1784 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
1785 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,4,6,7]
1786 ; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm4
1787 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
1788 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
1789 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
1790 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1791 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1792 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15]
1793 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
1794 ; AVX512DQ-NEXT: vmovdqa %xmm3, (%rsi)
1795 ; AVX512DQ-NEXT: vmovdqa %xmm6, (%rdx)
1796 ; AVX512DQ-NEXT: vmovdqa %xmm7, (%rcx)
1797 ; AVX512DQ-NEXT: vmovdqa %xmm8, (%r8)
1798 ; AVX512DQ-NEXT: vmovdqa %xmm9, (%r9)
1799 ; AVX512DQ-NEXT: vmovdqa %xmm10, (%r10)
1800 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rax)
1801 ; AVX512DQ-NEXT: vzeroupper
1802 ; AVX512DQ-NEXT: retq
1803 ;
1804 ; AVX512DQ-FCP-LABEL: load_i16_stride7_vf8:
1805 ; AVX512DQ-FCP: # %bb.0:
1806 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1807 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1808 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm0
1809 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
1810 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1,2,3]
1811 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm2
1812 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1813 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5]
1814 ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
1815 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4
1816 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
1817 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
1818 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
1819 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7]
1820 ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6
1821 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7]
1822 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3]
1823 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7]
1824 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
1825 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
1826 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7]
1827 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3,4],xmm8[5,6,7]
1828 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
1829 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7]
1830 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
1831 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
1832 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
1833 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7]
1834 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
1835 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
1836 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
1837 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,4,5,2,3,0,1,14,15,8,9,6,7,0,1]
1838 ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm8
1839 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
1840 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
1841 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7]
1842 ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm10
1843 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7]
1844 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11]
1845 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7]
1846 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
1847 ; AVX512DQ-FCP-NEXT: vmovd {{.*#+}} xmm11 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
1848 ; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm12
1849 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10
1850 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7]
1851 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
1852 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
1853 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
1854 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12
1855 ; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm11
1856 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
1857 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
1858 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2,3]
1859 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm11[1,2,3,4,5,6],xmm2[7]
1860 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
1861 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3]
1862 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
1863 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
1864 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
1865 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
1866 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
1867 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
1868 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
1869 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15]
1870 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
1871 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rsi)
1872 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%rdx)
1873 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, (%rcx)
1874 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%r8)
1875 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, (%r9)
1876 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm10, (%r10)
1877 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rax)
1878 ; AVX512DQ-FCP-NEXT: vzeroupper
1879 ; AVX512DQ-FCP-NEXT: retq
1880 ;
1881 ; AVX512BW-LABEL: load_i16_stride7_vf8:
1882 ; AVX512BW: # %bb.0:
1883 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
1884 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
1885 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1886 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
1887 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,7,14,21,28,35,42,49]
1888 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
1889 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,8,15,22,29,36,43,50]
1890 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
1891 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,9,16,23,30,37,44,51]
1892 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
1893 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,10,17,24,31,38,45,52]
1894 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
1895 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,11,18,25,32,39,46,53]
1896 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
1897 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,12,19,26,33,40,47,54]
1898 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
1899 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,13,20,27,34,41,48,55]
1900 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
1901 ; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi)
1902 ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx)
1903 ; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx)
1904 ; AVX512BW-NEXT: vmovdqa %xmm5, (%r8)
1905 ; AVX512BW-NEXT: vmovdqa %xmm6, (%r9)
1906 ; AVX512BW-NEXT: vmovdqa %xmm7, (%r10)
1907 ; AVX512BW-NEXT: vmovdqa %xmm8, (%rax)
1908 ; AVX512BW-NEXT: vzeroupper
1909 ; AVX512BW-NEXT: retq
1910 ;
1911 ; AVX512BW-FCP-LABEL: load_i16_stride7_vf8:
1912 ; AVX512BW-FCP: # %bb.0:
1913 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1914 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1915 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1916 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1917 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,7,14,21,28,35,42,49]
1918 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
1919 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,8,15,22,29,36,43,50]
1920 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
1921 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,9,16,23,30,37,44,51]
1922 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
1923 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,10,17,24,31,38,45,52]
1924 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
1925 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,11,18,25,32,39,46,53]
1926 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
1927 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,12,19,26,33,40,47,54]
1928 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
1929 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,13,20,27,34,41,48,55]
1930 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
1931 ; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rsi)
1932 ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
1933 ; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
1934 ; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%r8)
1935 ; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%r9)
1936 ; AVX512BW-FCP-NEXT: vmovdqa %xmm7, (%r10)
1937 ; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%rax)
1938 ; AVX512BW-FCP-NEXT: vzeroupper
1939 ; AVX512BW-FCP-NEXT: retq
1940 ;
1941 ; AVX512DQ-BW-LABEL: load_i16_stride7_vf8:
1942 ; AVX512DQ-BW: # %bb.0:
1943 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
1944 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
1945 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
1946 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
1947 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,7,14,21,28,35,42,49]
1948 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
1949 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,8,15,22,29,36,43,50]
1950 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
1951 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,9,16,23,30,37,44,51]
1952 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
1953 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,10,17,24,31,38,45,52]
1954 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
1955 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,11,18,25,32,39,46,53]
1956 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
1957 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,12,19,26,33,40,47,54]
1958 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
1959 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,13,20,27,34,41,48,55]
1960 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
1961 ; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rsi)
1962 ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx)
1963 ; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx)
1964 ; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%r8)
1965 ; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%r9)
1966 ; AVX512DQ-BW-NEXT: vmovdqa %xmm7, (%r10)
1967 ; AVX512DQ-BW-NEXT: vmovdqa %xmm8, (%rax)
1968 ; AVX512DQ-BW-NEXT: vzeroupper
1969 ; AVX512DQ-BW-NEXT: retq
1970 ;
1971 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride7_vf8:
1972 ; AVX512DQ-BW-FCP: # %bb.0:
1973 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1974 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1975 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1976 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1977 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,7,14,21,28,35,42,49]
1978 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
1979 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,8,15,22,29,36,43,50]
1980 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
1981 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,9,16,23,30,37,44,51]
1982 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
1983 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,10,17,24,31,38,45,52]
1984 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
1985 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,11,18,25,32,39,46,53]
1986 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
1987 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,12,19,26,33,40,47,54]
1988 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
1989 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,13,20,27,34,41,48,55]
1990 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
1991 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rsi)
1992 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
1993 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
1994 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%r8)
1995 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%r9)
1996 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm7, (%r10)
1997 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%rax)
1998 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
1999 ; AVX512DQ-BW-FCP-NEXT: retq
2000 %wide.vec = load <56 x i16>, ptr %in.vec, align 64
2001 %strided.vec0 = shufflevector <56 x i16> %wide.vec, <56 x i16> poison, <8 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49>
2002 %strided.vec1 = shufflevector <56 x i16> %wide.vec, <56 x i16> poison, <8 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50>
2003 %strided.vec2 = shufflevector <56 x i16> %wide.vec, <56 x i16> poison, <8 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51>
2004 %strided.vec3 = shufflevector <56 x i16> %wide.vec, <56 x i16> poison, <8 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52>
2005 %strided.vec4 = shufflevector <56 x i16> %wide.vec, <56 x i16> poison, <8 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53>
2006 %strided.vec5 = shufflevector <56 x i16> %wide.vec, <56 x i16> poison, <8 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54>
2007 %strided.vec6 = shufflevector <56 x i16> %wide.vec, <56 x i16> poison, <8 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55>
2008 store <8 x i16> %strided.vec0, ptr %out.vec0, align 64
2009 store <8 x i16> %strided.vec1, ptr %out.vec1, align 64
2010 store <8 x i16> %strided.vec2, ptr %out.vec2, align 64
2011 store <8 x i16> %strided.vec3, ptr %out.vec3, align 64
2012 store <8 x i16> %strided.vec4, ptr %out.vec4, align 64
2013 store <8 x i16> %strided.vec5, ptr %out.vec5, align 64
2014 store <8 x i16> %strided.vec6, ptr %out.vec6, align 64
2015 ret void
2016 }
2018 define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
2019 ; SSE-LABEL: load_i16_stride7_vf16:
2020 ; SSE: # %bb.0:
2021 ; SSE-NEXT: subq $232, %rsp
2022 ; SSE-NEXT: movdqa 80(%rdi), %xmm11
2023 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2024 ; SSE-NEXT: movdqa 64(%rdi), %xmm9
2025 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2026 ; SSE-NEXT: movdqa 112(%rdi), %xmm12
2027 ; SSE-NEXT: movdqa 128(%rdi), %xmm6
2028 ; SSE-NEXT: movaps 160(%rdi), %xmm5
2029 ; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill
2030 ; SSE-NEXT: movaps 144(%rdi), %xmm7
2031 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2032 ; SSE-NEXT: movdqa 192(%rdi), %xmm13
2033 ; SSE-NEXT: movdqa 176(%rdi), %xmm15
2034 ; SSE-NEXT: movdqa 208(%rdi), %xmm14
2035 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0]
2036 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2037 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0]
2038 ; SSE-NEXT: movdqa %xmm1, %xmm2
2039 ; SSE-NEXT: pandn %xmm0, %xmm2
2040 ; SSE-NEXT: movdqa %xmm15, %xmm0
2041 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
2042 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2043 ; SSE-NEXT: pand %xmm1, %xmm0
2044 ; SSE-NEXT: por %xmm2, %xmm0
2045 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,0,0,0]
2046 ; SSE-NEXT: movdqa %xmm3, %xmm2
2047 ; SSE-NEXT: movdqa %xmm3, %xmm10
2048 ; SSE-NEXT: pandn %xmm0, %xmm2
2049 ; SSE-NEXT: movaps %xmm7, %xmm0
2050 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm5[2,2]
2051 ; SSE-NEXT: movaps {{.*#+}} xmm8 = [65535,65535,65535,0,0,65535,65535,65535]
2052 ; SSE-NEXT: movaps %xmm8, %xmm4
2053 ; SSE-NEXT: andnps %xmm0, %xmm4
2054 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,3,3]
2055 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2056 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,1,0,3]
2057 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2058 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,7,6,7]
2059 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
2060 ; SSE-NEXT: movdqa 96(%rdi), %xmm0
2061 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2062 ; SSE-NEXT: pand %xmm8, %xmm3
2063 ; SSE-NEXT: por %xmm4, %xmm3
2064 ; SSE-NEXT: pand %xmm10, %xmm3
2065 ; SSE-NEXT: por %xmm2, %xmm3
2066 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2067 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2068 ; SSE-NEXT: movdqa %xmm1, %xmm2
2069 ; SSE-NEXT: pandn %xmm0, %xmm2
2070 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
2071 ; SSE-NEXT: pand %xmm1, %xmm9
2072 ; SSE-NEXT: por %xmm2, %xmm9
2073 ; SSE-NEXT: movdqa %xmm10, %xmm2
2074 ; SSE-NEXT: pandn %xmm9, %xmm2
2075 ; SSE-NEXT: movaps 32(%rdi), %xmm0
2076 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2077 ; SSE-NEXT: movaps 48(%rdi), %xmm3
2078 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2079 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2]
2080 ; SSE-NEXT: movaps %xmm8, %xmm4
2081 ; SSE-NEXT: andnps %xmm0, %xmm4
2082 ; SSE-NEXT: movdqa (%rdi), %xmm0
2083 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2084 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
2085 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,7,6,7]
2086 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
2087 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2088 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
2089 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
2090 ; SSE-NEXT: pand %xmm8, %xmm3
2091 ; SSE-NEXT: por %xmm4, %xmm3
2092 ; SSE-NEXT: pand %xmm10, %xmm3
2093 ; SSE-NEXT: por %xmm2, %xmm3
2094 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2095 ; SSE-NEXT: pslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm14[0,1,2,3,4,5]
2096 ; SSE-NEXT: movdqa %xmm1, %xmm2
2097 ; SSE-NEXT: pandn %xmm14, %xmm2
2098 ; SSE-NEXT: psrld $16, %xmm13
2099 ; SSE-NEXT: movdqa %xmm15, %xmm4
2100 ; SSE-NEXT: movdqa %xmm15, %xmm11
2101 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2102 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
2103 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1]
2104 ; SSE-NEXT: pand %xmm1, %xmm4
2105 ; SSE-NEXT: por %xmm2, %xmm4
2106 ; SSE-NEXT: movdqa %xmm10, %xmm0
2107 ; SSE-NEXT: movdqa %xmm10, %xmm14
2108 ; SSE-NEXT: pandn %xmm4, %xmm0
2109 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535]
2110 ; SSE-NEXT: movdqa %xmm7, %xmm4
2111 ; SSE-NEXT: pandn %xmm12, %xmm4
2112 ; SSE-NEXT: pand %xmm7, %xmm6
2113 ; SSE-NEXT: por %xmm4, %xmm6
2114 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
2115 ; SSE-NEXT: movdqa %xmm9, %xmm4
2116 ; SSE-NEXT: movdqa (%rsp), %xmm13 # 16-byte Reload
2117 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
2118 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1]
2119 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
2120 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7]
2121 ; SSE-NEXT: movdqa %xmm8, %xmm2
2122 ; SSE-NEXT: pandn %xmm4, %xmm2
2123 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,3,2,3]
2124 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
2125 ; SSE-NEXT: pand %xmm8, %xmm4
2126 ; SSE-NEXT: por %xmm4, %xmm2
2127 ; SSE-NEXT: pand %xmm10, %xmm2
2128 ; SSE-NEXT: por %xmm0, %xmm2
2129 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2130 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2131 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
2132 ; SSE-NEXT: movdqa %xmm1, %xmm4
2133 ; SSE-NEXT: pandn %xmm0, %xmm4
2134 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2135 ; SSE-NEXT: movdqa %xmm3, %xmm0
2136 ; SSE-NEXT: psrld $16, %xmm0
2137 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
2138 ; SSE-NEXT: movdqa %xmm15, %xmm5
2139 ; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
2140 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
2141 ; SSE-NEXT: pand %xmm1, %xmm5
2142 ; SSE-NEXT: por %xmm4, %xmm5
2143 ; SSE-NEXT: movdqa %xmm10, %xmm0
2144 ; SSE-NEXT: pandn %xmm5, %xmm0
2145 ; SSE-NEXT: movdqa %xmm7, %xmm4
2146 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2147 ; SSE-NEXT: pandn %xmm10, %xmm4
2148 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
2149 ; SSE-NEXT: movdqa %xmm12, %xmm5
2150 ; SSE-NEXT: pand %xmm7, %xmm5
2151 ; SSE-NEXT: por %xmm4, %xmm5
2152 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,3]
2153 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
2154 ; SSE-NEXT: pand %xmm8, %xmm4
2155 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2156 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2157 ; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
2158 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,1]
2159 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7]
2160 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
2161 ; SSE-NEXT: pandn %xmm5, %xmm8
2162 ; SSE-NEXT: por %xmm4, %xmm8
2163 ; SSE-NEXT: pand %xmm14, %xmm8
2164 ; SSE-NEXT: por %xmm0, %xmm8
2165 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2166 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2167 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,0,1]
2168 ; SSE-NEXT: movdqa %xmm1, %xmm4
2169 ; SSE-NEXT: pandn %xmm0, %xmm4
2170 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2171 ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm6[2],xmm11[3],xmm6[3]
2172 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,1]
2173 ; SSE-NEXT: pand %xmm1, %xmm0
2174 ; SSE-NEXT: por %xmm4, %xmm0
2175 ; SSE-NEXT: movdqa %xmm14, %xmm4
2176 ; SSE-NEXT: pandn %xmm0, %xmm4
2177 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,1,0,3]
2178 ; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,5,4,7]
2179 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,3,3]
2180 ; SSE-NEXT: movdqa %xmm13, %xmm2
2181 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm11 = xmm11[1],xmm0[1]
2182 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2183 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2184 ; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
2185 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,1,2,3]
2186 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
2187 ; SSE-NEXT: movss {{.*#+}} xmm11 = xmm0[0],xmm11[1,2,3]
2188 ; SSE-NEXT: andps %xmm14, %xmm11
2189 ; SSE-NEXT: orps %xmm4, %xmm11
2190 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2191 ; SSE-NEXT: movdqa %xmm15, %xmm0
2192 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
2193 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
2194 ; SSE-NEXT: pand %xmm1, %xmm0
2195 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2196 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,0,1]
2197 ; SSE-NEXT: pandn %xmm4, %xmm1
2198 ; SSE-NEXT: por %xmm0, %xmm1
2199 ; SSE-NEXT: movdqa %xmm14, %xmm0
2200 ; SSE-NEXT: movaps %xmm14, %xmm15
2201 ; SSE-NEXT: pandn %xmm1, %xmm0
2202 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
2203 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,1,0,3]
2204 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,7]
2205 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2206 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,3,3]
2207 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm1[1]
2208 ; SSE-NEXT: movdqa %xmm10, %xmm1
2209 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
2210 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm1[2,1,2,3]
2211 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7]
2212 ; SSE-NEXT: movss {{.*#+}} xmm4 = xmm11[0],xmm4[1,2,3]
2213 ; SSE-NEXT: andps %xmm15, %xmm4
2214 ; SSE-NEXT: orps %xmm0, %xmm4
2215 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2216 ; SSE-NEXT: movdqa %xmm7, %xmm0
2217 ; SSE-NEXT: pandn %xmm9, %xmm0
2218 ; SSE-NEXT: pand %xmm7, %xmm2
2219 ; SSE-NEXT: por %xmm0, %xmm2
2220 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,1,1,0,4,5,6,7]
2221 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
2222 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
2223 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
2224 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3]
2225 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2226 ; SSE-NEXT: movdqa %xmm4, %xmm5
2227 ; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
2228 ; SSE-NEXT: movdqa %xmm6, %xmm12
2229 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
2230 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7]
2231 ; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
2232 ; SSE-NEXT: movdqa %xmm8, %xmm6
2233 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
2234 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
2235 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7]
2236 ; SSE-NEXT: movaps %xmm15, %xmm2
2237 ; SSE-NEXT: andnps %xmm5, %xmm2
2238 ; SSE-NEXT: andps %xmm15, %xmm0
2239 ; SSE-NEXT: orps %xmm0, %xmm2
2240 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2241 ; SSE-NEXT: movdqa %xmm7, %xmm0
2242 ; SSE-NEXT: pandn %xmm13, %xmm0
2243 ; SSE-NEXT: movdqa %xmm13, %xmm9
2244 ; SSE-NEXT: movdqa %xmm14, %xmm5
2245 ; SSE-NEXT: movdqa %xmm14, %xmm13
2246 ; SSE-NEXT: pand %xmm7, %xmm5
2247 ; SSE-NEXT: por %xmm0, %xmm5
2248 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,1,1,0,4,5,6,7]
2249 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
2250 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
2251 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
2252 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
2253 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2254 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2255 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
2256 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
2257 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7]
2258 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
2259 ; SSE-NEXT: andps %xmm15, %xmm0
2260 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
2261 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
2262 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
2263 ; SSE-NEXT: andnps %xmm1, %xmm15
2264 ; SSE-NEXT: orps %xmm0, %xmm15
2265 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2266 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2267 ; SSE-NEXT: movdqa %xmm0, %xmm14
2268 ; SSE-NEXT: psrld $16, %xmm14
2269 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2270 ; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7]
2271 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2272 ; SSE-NEXT: movdqa %xmm0, %xmm5
2273 ; SSE-NEXT: movdqa %xmm2, %xmm0
2274 ; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
2275 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2276 ; SSE-NEXT: psrlq $48, %xmm0
2277 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2278 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2279 ; SSE-NEXT: movdqa %xmm8, %xmm0
2280 ; SSE-NEXT: psrlq $16, %xmm0
2281 ; SSE-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload
2282 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
2283 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
2284 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2285 ; SSE-NEXT: movdqa %xmm4, %xmm5
2286 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,1,0,3]
2287 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
2288 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
2289 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
2290 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0]
2291 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
2292 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
2293 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,4,7]
2294 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2295 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2296 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2297 ; SSE-NEXT: movdqa %xmm10, %xmm15
2298 ; SSE-NEXT: psrld $16, %xmm15
2299 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2300 ; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7]
2301 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,3,2,3]
2302 ; SSE-NEXT: movdqa %xmm2, %xmm1
2303 ; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
2304 ; SSE-NEXT: psrlq $48, %xmm1
2305 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2306 ; SSE-NEXT: movdqa %xmm9, %xmm4
2307 ; SSE-NEXT: movdqa %xmm9, %xmm1
2308 ; SSE-NEXT: psrlq $16, %xmm1
2309 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3]
2310 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
2311 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2312 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
2313 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,3]
2314 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
2315 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2316 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2317 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
2318 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
2319 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
2320 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
2321 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
2322 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2323 ; SSE-NEXT: movdqa %xmm8, %xmm0
2324 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
2325 ; SSE-NEXT: movdqa %xmm11, %xmm8
2326 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2327 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
2328 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
2329 ; SSE-NEXT: movdqa %xmm7, %xmm0
2330 ; SSE-NEXT: pandn %xmm5, %xmm0
2331 ; SSE-NEXT: movdqa %xmm12, %xmm11
2332 ; SSE-NEXT: pand %xmm7, %xmm11
2333 ; SSE-NEXT: por %xmm0, %xmm11
2334 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,3]
2335 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7]
2336 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
2337 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2338 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm0[0,2]
2339 ; SSE-NEXT: pand %xmm7, %xmm2
2340 ; SSE-NEXT: movdqa %xmm9, %xmm11
2341 ; SSE-NEXT: pandn %xmm9, %xmm7
2342 ; SSE-NEXT: por %xmm2, %xmm7
2343 ; SSE-NEXT: movdqa %xmm4, %xmm0
2344 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
2345 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
2346 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
2347 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
2348 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3]
2349 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7]
2350 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
2351 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2352 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm0[0,2]
2353 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
2354 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2355 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
2356 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
2357 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
2358 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2359 ; SSE-NEXT: # xmm2 = mem[2,1,2,3]
2360 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
2361 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2362 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[1,1,1,1,4,5,6,7]
2363 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
2364 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,1,0,3]
2365 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
2366 ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7]
2367 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3]
2368 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
2369 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[1,1,1,1]
2370 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
2371 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
2372 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,1,1]
2373 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[2,1,2,3]
2374 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7]
2375 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1]
2376 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2377 ; SSE-NEXT: # xmm2 = mem[1,1,1,1,4,5,6,7]
2378 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3]
2379 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3]
2380 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[2,2,2,2]
2381 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
2382 ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm2[2],xmm11[3],xmm2[3]
2383 ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm9[0],xmm11[1]
2384 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2385 ; SSE-NEXT: movaps %xmm2, (%rsi)
2386 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2387 ; SSE-NEXT: movaps %xmm2, 16(%rsi)
2388 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2389 ; SSE-NEXT: movaps %xmm2, (%rdx)
2390 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2391 ; SSE-NEXT: movaps %xmm2, 16(%rdx)
2392 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2393 ; SSE-NEXT: movaps %xmm2, (%rcx)
2394 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2395 ; SSE-NEXT: movaps %xmm2, 16(%rcx)
2396 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2397 ; SSE-NEXT: movaps %xmm2, (%r8)
2398 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2399 ; SSE-NEXT: movaps %xmm2, 16(%r8)
2400 ; SSE-NEXT: movapd %xmm1, (%r9)
2401 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2402 ; SSE-NEXT: movaps %xmm1, 16(%r9)
2403 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
2404 ; SSE-NEXT: movaps %xmm15, (%rax)
2405 ; SSE-NEXT: movaps %xmm14, 16(%rax)
2406 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
2407 ; SSE-NEXT: movapd %xmm11, (%rax)
2408 ; SSE-NEXT: movapd %xmm0, 16(%rax)
2409 ; SSE-NEXT: addq $232, %rsp
2410 ; SSE-NEXT: retq
2411 ;
2412 ; AVX-LABEL: load_i16_stride7_vf16:
2413 ; AVX: # %bb.0:
2414 ; AVX-NEXT: subq $264, %rsp # imm = 0x108
2415 ; AVX-NEXT: vmovdqa 176(%rdi), %xmm0
2416 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2417 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
2418 ; AVX-NEXT: vmovdqa 160(%rdi), %xmm12
2419 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[2,2,3,3]
2420 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2421 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm1
2422 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2423 ; AVX-NEXT: vpsrlq $16, %xmm1, %xmm1
2424 ; AVX-NEXT: vmovdqa 144(%rdi), %xmm2
2425 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2426 ; AVX-NEXT: vmovdqa %xmm2, %xmm8
2427 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2428 ; AVX-NEXT: vmovdqa 208(%rdi), %xmm2
2429 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2430 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm1
2431 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2432 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
2433 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
2434 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
2435 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
2436 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm5
2437 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm2
2438 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2439 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
2440 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2441 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm1
2442 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,0,0]
2443 ; AVX-NEXT: vmovdqa %xmm1, %xmm7
2444 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm4[0,1,2,3,4,5,6],xmm10[7]
2445 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm6
2446 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,3,2,3]
2447 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7]
2448 ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm11
2449 ; AVX-NEXT: vmovdqa (%rdi), %xmm1
2450 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2451 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm9
2452 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[2,2,3,3]
2453 ; AVX-NEXT: vmovdqa %xmm9, %xmm10
2454 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,1,0,3]
2455 ; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,7,6,7]
2456 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
2457 ; AVX-NEXT: vmovaps 32(%rdi), %xmm1
2458 ; AVX-NEXT: vmovaps 48(%rdi), %xmm2
2459 ; AVX-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm1[2],xmm2[2],zero
2460 ; AVX-NEXT: vmovaps %xmm2, %xmm13
2461 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2462 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2463 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1,2],xmm9[3,4],xmm15[5,6,7]
2464 ; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
2465 ; AVX-NEXT: vandnps %ymm11, %ymm15, %ymm11
2466 ; AVX-NEXT: vandps %ymm15, %ymm9, %ymm9
2467 ; AVX-NEXT: vorps %ymm11, %ymm9, %ymm9
2468 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2469 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5,6,7]
2470 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2471 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2472 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7]
2473 ; AVX-NEXT: vmovdqa %xmm12, %xmm4
2474 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
2475 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
2476 ; AVX-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload
2477 ; AVX-NEXT: # xmm9 = mem[0,1,2,3,4,5],xmm8[6],mem[7]
2478 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,3]
2479 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,0,3,2,4,5,6,7]
2480 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2,3],xmm0[4,5,6,7]
2481 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2482 ; AVX-NEXT: vpslld $16, %xmm2, %xmm9
2483 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
2484 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
2485 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm9[6,7]
2486 ; AVX-NEXT: vpsrld $16, %xmm5, %xmm9
2487 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2488 ; AVX-NEXT: vpsrldq {{.*#+}} xmm11 = xmm14[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
2489 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
2490 ; AVX-NEXT: vpslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm7[0,1,2,3,4,5]
2491 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2492 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm11[7]
2493 ; AVX-NEXT: vpsrld $16, %xmm6, %xmm11
2494 ; AVX-NEXT: vmovdqa %xmm6, %xmm5
2495 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2496 ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm9, %ymm9
2497 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7]
2498 ; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7]
2499 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2500 ; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0],xmm1[1],xmm10[2,3,4,5,6,7]
2501 ; AVX-NEXT: vmovdqa %xmm10, %xmm13
2502 ; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2503 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,3]
2504 ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,0,3,3,4,5,6,7]
2505 ; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm11[3,4],xmm15[5,6,7]
2506 ; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
2507 ; AVX-NEXT: vandnps %ymm9, %ymm6, %ymm9
2508 ; AVX-NEXT: vandps %ymm6, %ymm15, %ymm15
2509 ; AVX-NEXT: vorps %ymm9, %ymm15, %ymm9
2510 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2511 ; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
2512 ; AVX-NEXT: vandnps %ymm0, %ymm6, %ymm0
2513 ; AVX-NEXT: vandps %ymm6, %ymm9, %ymm9
2514 ; AVX-NEXT: vorps %ymm0, %ymm9, %ymm0
2515 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2516 ; AVX-NEXT: vpsllq $16, %xmm3, %xmm0
2517 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
2518 ; AVX-NEXT: vmovdqa %xmm4, %xmm15
2519 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2520 ; AVX-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill
2521 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,3,2,3]
2522 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,0,3,4,5,6,7]
2523 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2524 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm9[2,3],xmm6[4,5,6,7]
2525 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2,3],xmm0[4,5,6,7]
2526 ; AVX-NEXT: vpsrldq {{.*#+}} xmm9 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
2527 ; AVX-NEXT: vmovdqa %xmm12, %xmm11
2528 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
2529 ; AVX-NEXT: vmovdqa %xmm2, %xmm4
2530 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm9[6,7]
2531 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2532 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,2,2,2]
2533 ; AVX-NEXT: vmovdqa %xmm14, %xmm2
2534 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0,1,2,3,4,5],xmm9[6,7]
2535 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,1,0,1]
2536 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6],xmm10[7]
2537 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[1,1,1,1]
2538 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9
2539 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2540 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,1,0,3]
2541 ; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7]
2542 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2543 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[2,2,3,3]
2544 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm10 = xmm10[1],xmm12[1]
2545 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3]
2546 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[2,1,2,3]
2547 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
2548 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3,4,5,6,7]
2549 ; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
2550 ; AVX-NEXT: vandnps %ymm9, %ymm5, %ymm9
2551 ; AVX-NEXT: vandps %ymm5, %ymm1, %ymm1
2552 ; AVX-NEXT: vorps %ymm1, %ymm9, %ymm1
2553 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2554 ; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
2555 ; AVX-NEXT: vandnps %ymm0, %ymm5, %ymm0
2556 ; AVX-NEXT: vandps %ymm5, %ymm1, %ymm1
2557 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
2558 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2559 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
2560 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
2561 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,3,4,5,6,7]
2562 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2563 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,5],xmm5[6],xmm15[7]
2564 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
2565 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7]
2566 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3,4,5,6,7]
2567 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
2568 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,1,2,3,4,5,6,7]
2569 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,0,0,0]
2570 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm9[6,7]
2571 ; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,6,7]
2572 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,2]
2573 ; AVX-NEXT: vmovdqa %xmm7, %xmm13
2574 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm14[0],xmm7[1],xmm14[2,3,4,5,6,7]
2575 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,0,4,5,6,7]
2576 ; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7]
2577 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3,4,5,6,7]
2578 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
2579 ; AVX-NEXT: vmovdqa %xmm2, %xmm11
2580 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1]
2581 ; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,7,7]
2582 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
2583 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
2584 ; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,4,5,8,9,2,3]
2585 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2586 ; AVX-NEXT: vpsrlq $48, %xmm8, %xmm12
2587 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10
2588 ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
2589 ; AVX-NEXT: vandps %ymm2, %ymm9, %ymm9
2590 ; AVX-NEXT: vandnps %ymm10, %ymm2, %ymm10
2591 ; AVX-NEXT: vorps %ymm10, %ymm9, %ymm9
2592 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
2593 ; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
2594 ; AVX-NEXT: vandnps %ymm1, %ymm15, %ymm1
2595 ; AVX-NEXT: vandps %ymm15, %ymm9, %ymm9
2596 ; AVX-NEXT: vorps %ymm1, %ymm9, %ymm1
2597 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2598 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[1,1,1,1]
2599 ; AVX-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload
2600 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5,6,7]
2601 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,3,2,3]
2602 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,0,3,4,5,6,7]
2603 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2604 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1]
2605 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1,2],xmm9[3,4,5,6,7]
2606 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
2607 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
2608 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
2609 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2610 ; AVX-NEXT: vpsrlq $48, %xmm4, %xmm1
2611 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2612 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,3,2,3]
2613 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
2614 ; AVX-NEXT: vpsrlq $16, %xmm13, %xmm9
2615 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3]
2616 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5,6,7]
2617 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,1,0,3]
2618 ; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7]
2619 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
2620 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm12[2],xmm9[3],xmm12[3]
2621 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
2622 ; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,4,5,8,9,6,7]
2623 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm8[2,3,2,3]
2624 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9
2625 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3,4,5,6,7]
2626 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2627 ; AVX-NEXT: vandnps %ymm0, %ymm15, %ymm0
2628 ; AVX-NEXT: vandps %ymm1, %ymm15, %ymm1
2629 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
2630 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2631 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
2632 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7]
2633 ; AVX-NEXT: vpsrlq $48, %xmm6, %xmm1
2634 ; AVX-NEXT: vmovdqa %xmm6, %xmm15
2635 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2636 ; AVX-NEXT: vpsrldq {{.*#+}} xmm9 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
2637 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
2638 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
2639 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
2640 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2641 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,5],xmm2[6],xmm6[7]
2642 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
2643 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
2644 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
2645 ; AVX-NEXT: vpsrld $16, %xmm3, %xmm1
2646 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
2647 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
2648 ; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,6,7]
2649 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,2]
2650 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm9[2,3],xmm1[4,5,6,7]
2651 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm11[1],xmm12[2,3,4,5,6,7]
2652 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,3]
2653 ; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,4,7,7]
2654 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
2655 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,2,2,3,4,5,6,7]
2656 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,2]
2657 ; AVX-NEXT: vpsrldq {{.*#+}} xmm10 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
2658 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9
2659 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3,4,5,6,7]
2660 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2661 ; AVX-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
2662 ; AVX-NEXT: vandnps %ymm0, %ymm9, %ymm0
2663 ; AVX-NEXT: vandps %ymm1, %ymm9, %ymm1
2664 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
2665 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2666 ; AVX-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
2667 ; AVX-NEXT: # xmm1 = zero,xmm1[1],mem[0],zero
2668 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm5[2],xmm15[2],xmm5[3],xmm15[3]
2669 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3,4],xmm5[5,6,7]
2670 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm6[1],xmm2[2,3,4,5,6,7]
2671 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
2672 ; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
2673 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
2674 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[1,1,1,1]
2675 ; AVX-NEXT: vpsrldq {{.*#+}} xmm6 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
2676 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
2677 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2678 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
2679 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
2680 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
2681 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
2682 ; AVX-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero
2683 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3]
2684 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,1,0,3]
2685 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
2686 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
2687 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,3,3,3]
2688 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
2689 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7]
2690 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
2691 ; AVX-NEXT: vandnps %ymm1, %ymm9, %ymm1
2692 ; AVX-NEXT: vandps %ymm2, %ymm9, %ymm2
2693 ; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1
2694 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2695 ; AVX-NEXT: vmovaps %ymm2, (%rsi)
2696 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2697 ; AVX-NEXT: vmovaps %ymm2, (%rdx)
2698 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2699 ; AVX-NEXT: vmovaps %ymm2, (%rcx)
2700 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2701 ; AVX-NEXT: vmovaps %ymm2, (%r8)
2702 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2703 ; AVX-NEXT: vmovaps %ymm2, (%r9)
2704 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
2705 ; AVX-NEXT: vmovaps %ymm0, (%rax)
2706 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
2707 ; AVX-NEXT: vmovaps %ymm1, (%rax)
2708 ; AVX-NEXT: addq $264, %rsp # imm = 0x108
2709 ; AVX-NEXT: vzeroupper
2710 ; AVX-NEXT: retq
2711 ;
2712 ; AVX2-LABEL: load_i16_stride7_vf16:
2713 ; AVX2: # %bb.0:
2714 ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
2715 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
2716 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm2
2717 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3
2718 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm9
2719 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm5
2720 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm6
2721 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
2722 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm4[0,1,2,3,4,6,4,7]
2723 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4
2724 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
2725 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
2726 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
2727 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2728 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm9[0,1,0,2]
2729 ; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[0,1,2,1,4,5,6,5]
2730 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
2731 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
2732 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
2733 ; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm10
2734 ; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5],xmm10[6],xmm8[7]
2735 ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
2736 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
2737 ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3]
2738 ; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
2739 ; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0]
2740 ; AVX2-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8
2741 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7]
2742 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2743 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
2744 ; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm10
2745 ; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2,3,4,5],xmm8[6],xmm10[7]
2746 ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
2747 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
2748 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
2749 ; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7,8,9,10],ymm12[11],ymm10[12,13,14,15]
2750 ; AVX2-NEXT: vpshufb {{.*#+}} ymm12 = ymm10[2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21]
2751 ; AVX2-NEXT: vmovdqa %xmm11, %xmm10
2752 ; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm12, %ymm8
2753 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
2754 ; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12
2755 ; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
2756 ; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
2757 ; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2758 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7]
2759 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
2760 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5,6],ymm7[7]
2761 ; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
2762 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
2763 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7]
2764 ; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm11
2765 ; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3,4,5],xmm11[6],xmm8[7]
2766 ; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
2767 ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm2[2,3,0,1]
2768 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2769 ; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4],ymm12[5,6,7,8,9,10,11],ymm11[12],ymm12[13,14,15]
2770 ; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23]
2771 ; AVX2-NEXT: vpblendvb %ymm10, %ymm8, %ymm11, %ymm8
2772 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
2773 ; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12
2774 ; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3,4,5],xmm11[6],xmm12[7]
2775 ; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
2776 ; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2777 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,1,2]
2778 ; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,1,0,3,4,5,4,7]
2779 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
2780 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
2781 ; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm8[0],ymm11[1,2,3,4,5,6,7],ymm8[8],ymm11[9,10,11,12,13,14,15]
2782 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7]
2783 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
2784 ; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12
2785 ; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7]
2786 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,1,0,4,5,6,7]
2787 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,6,7]
2788 ; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2789 ; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,1,3]
2790 ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm12[0,1,2,0,4,5,6,4]
2791 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
2792 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
2793 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
2794 ; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm13
2795 ; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2],xmm11[3],xmm13[4,5,6,7]
2796 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,2,1,0,4,5,6,7]
2797 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,7,7,7]
2798 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
2799 ; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,3,2,3]
2800 ; AVX2-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
2801 ; AVX2-NEXT: vpblendvb %ymm10, %ymm11, %ymm13, %ymm10
2802 ; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15]
2803 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
2804 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
2805 ; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm11
2806 ; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
2807 ; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
2808 ; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
2809 ; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[0,1,2,1,4,5,6,5]
2810 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
2811 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
2812 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
2813 ; AVX2-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
2814 ; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1,2,3,4,5,6,7],ymm12[8],ymm10[9,10,11,12,13,14,15]
2815 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
2816 ; AVX2-NEXT: vextracti128 $1, %ymm13, %xmm14
2817 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7]
2818 ; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
2819 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7]
2820 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
2821 ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,0,1]
2822 ; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7,8,9,10,11,12,13],ymm11[14],ymm12[15]
2823 ; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
2824 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
2825 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
2826 ; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12
2827 ; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3],xmm11[4],xmm12[5,6,7]
2828 ; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
2829 ; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2830 ; AVX2-NEXT: vmovdqa 208(%rdi), %xmm12
2831 ; AVX2-NEXT: vmovdqa 192(%rdi), %xmm13
2832 ; AVX2-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,5],xmm12[6],xmm13[7]
2833 ; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,0,3]
2834 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,6]
2835 ; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
2836 ; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7],ymm11[8,9,10,11,12],ymm14[13,14,15]
2837 ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
2838 ; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
2839 ; AVX2-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7,8],ymm14[9,10,11,12,13,14],ymm15[15]
2840 ; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
2841 ; AVX2-NEXT: vextracti128 $1, %ymm15, %xmm4
2842 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
2843 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
2844 ; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3]
2845 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7]
2846 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
2847 ; AVX2-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
2848 ; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3,4,5,6,7],ymm14[8],ymm11[9,10,11,12,13,14,15]
2849 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm14[2,3]
2850 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm11[4,5,6,7]
2851 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
2852 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
2853 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7]
2854 ; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7]
2855 ; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u]
2856 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2857 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
2858 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
2859 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
2860 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
2861 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7]
2862 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
2863 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15]
2864 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
2865 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7]
2866 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
2867 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
2868 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
2869 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2870 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
2871 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15]
2872 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2873 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
2874 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2875 ; AVX2-NEXT: vmovaps %ymm1, (%rsi)
2876 ; AVX2-NEXT: vmovdqa %ymm7, (%rdx)
2877 ; AVX2-NEXT: vmovdqa %ymm8, (%rcx)
2878 ; AVX2-NEXT: vmovdqa %ymm9, (%r8)
2879 ; AVX2-NEXT: vmovdqa %ymm10, (%r9)
2880 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
2881 ; AVX2-NEXT: vmovdqa %ymm11, (%rax)
2882 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
2883 ; AVX2-NEXT: vmovdqa %ymm0, (%rax)
2884 ; AVX2-NEXT: vzeroupper
2885 ; AVX2-NEXT: retq
2887 ; AVX2-FP-LABEL: load_i16_stride7_vf16:
2888 ; AVX2-FP: # %bb.0:
2889 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
2890 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1
2891 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm2
2892 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm3
2893 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm10
2894 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm5
2895 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm6
2896 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
2897 ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm7
2898 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,8,9,10,11,6,7,6,7]
2899 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,4,7]
2900 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
2901 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2902 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm10[0,1,0,2]
2903 ; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
2904 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm7, %ymm8
2905 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7]
2906 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
2907 ; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm11
2908 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5],xmm11[6],xmm8[7]
2909 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
2910 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
2911 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
2912 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
2913 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0]
2914 ; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8
2915 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7]
2916 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2917 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
2918 ; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm11
2919 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0],xmm8[1],xmm11[2,3,4,5],xmm8[6],xmm11[7]
2920 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
2921 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
2922 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1]
2923 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3],ymm11[4,5,6,7,8,9,10],ymm13[11],ymm11[12,13,14,15]
2924 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21]
2925 ; AVX2-FP-NEXT: vmovdqa %xmm12, %xmm11
2926 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm8, %ymm13, %ymm8
2927 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
2928 ; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm13
2929 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7]
2930 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
2931 ; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
2932 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
2933 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7]
2934 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
2935 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm7[4,5,6,7]
2936 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2937 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7]
2938 ; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm12
2939 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1],xmm8[2,3,4,5],xmm12[6],xmm8[7]
2940 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
2941 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1]
2942 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
2943 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4],ymm13[5,6,7,8,9,10,11],ymm12[12],ymm13[13,14,15]
2944 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23]
2945 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm8, %ymm12, %ymm8
2946 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
2947 ; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm13
2948 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3,4,5],xmm12[6],xmm13[7]
2949 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
2950 ; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
2951 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[0,1,1,2]
2952 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,30,31]
2953 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7]
2954 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm8[0],ymm12[1,2,3,4,5,6,7],ymm8[8],ymm12[9,10,11,12,13,14,15]
2955 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7]
2956 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
2957 ; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm13
2958 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7]
2959 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
2960 ; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm13
2961 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm10[0,1,1,3]
2962 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
2963 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5,6],ymm10[7]
2964 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
2965 ; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm14
2966 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3],xmm14[4,5,6,7]
2967 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15]
2968 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
2969 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,3,2,3]
2970 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
2971 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm13, %ymm14, %ymm11
2972 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15]
2973 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
2974 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
2975 ; AVX2-FP-NEXT: vmovd {{.*#+}} xmm11 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
2976 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm13, %xmm14
2977 ; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm13
2978 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7]
2979 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
2980 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
2981 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
2982 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
2983 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7,8,9,10,11,12,13],ymm15[14],ymm14[15]
2984 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3]
2985 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
2986 ; AVX2-FP-NEXT: vextracti128 $1, %ymm15, %xmm4
2987 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2],xmm15[3],xmm4[4,5,6,7]
2988 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
2989 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2990 ; AVX2-FP-NEXT: vpshufb %ymm9, %ymm12, %ymm9
2991 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7]
2992 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm4[1,2,3,4,5,6,7],ymm14[8],ymm4[9,10,11,12,13,14,15]
2993 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm4[4,5,6,7]
2994 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
2995 ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm12
2996 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1],xmm4[2],xmm12[3],xmm4[4],xmm12[5,6,7]
2997 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
2998 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2999 ; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm12
3000 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm13
3001 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,5],xmm12[6],xmm13[7]
3002 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
3003 ; AVX2-FP-NEXT: vpshufb %xmm15, %xmm14, %xmm14
3004 ; AVX2-FP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
3005 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm14[5,6,7],ymm4[8,9,10,11,12],ymm14[13,14,15]
3006 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
3007 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm14[2,3,0,1]
3008 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm14[1,2,3,4,5,6],ymm7[7,8],ymm14[9,10,11,12,13,14],ymm7[15]
3009 ; AVX2-FP-NEXT: vpshufb %ymm15, %ymm7, %ymm7
3010 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
3011 ; AVX2-FP-NEXT: vextracti128 $1, %ymm14, %xmm15
3012 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm15, %xmm11
3013 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
3014 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
3015 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4,5,6,7],ymm7[8],ymm4[9,10,11,12,13,14,15]
3016 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm11[0,1],xmm7[2,3]
3017 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3],ymm4[4,5,6,7]
3018 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
3019 ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5
3020 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7]
3021 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u]
3022 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3023 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7]
3024 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
3025 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm5
3026 ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
3027 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
3028 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7]
3029 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
3030 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15]
3031 ; AVX2-FP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
3032 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
3033 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
3034 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
3035 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
3036 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3037 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15]
3038 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
3039 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3040 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3041 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi)
3042 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3043 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx)
3044 ; AVX2-FP-NEXT: vmovdqa %ymm8, (%rcx)
3045 ; AVX2-FP-NEXT: vmovdqa %ymm10, (%r8)
3046 ; AVX2-FP-NEXT: vmovdqa %ymm9, (%r9)
3047 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3048 ; AVX2-FP-NEXT: vmovdqa %ymm11, (%rax)
3049 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3050 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rax)
3051 ; AVX2-FP-NEXT: vzeroupper
3052 ; AVX2-FP-NEXT: retq
3054 ; AVX2-FCP-LABEL: load_i16_stride7_vf16:
3055 ; AVX2-FCP: # %bb.0:
3056 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
3057 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
3058 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm2
3059 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm3
3060 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
3061 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm6
3062 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm7
3063 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,1,0,2]
3064 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,20,21,26,27]
3065 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm9
3066 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
3067 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [3,6,2,5,3,6,2,5]
3068 ; AVX2-FCP-NEXT: # ymm11 = mem[0,1,0,1]
3069 ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10
3070 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm4
3071 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm9[7]
3072 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
3073 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
3074 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5],xmm10[6],xmm9[7]
3075 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
3076 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
3077 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3]
3078 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
3079 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0]
3080 ; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9
3081 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5,6,7]
3082 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3083 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
3084 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
3085 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7]
3086 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
3087 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
3088 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,5,1,0,4,0,0,0]
3089 ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm10
3090 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm10[2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25]
3091 ; AVX2-FCP-NEXT: vmovdqa %xmm11, %xmm10
3092 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm12, %ymm9
3093 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
3094 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
3095 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
3096 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
3097 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
3098 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
3099 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5,6],ymm8[7]
3100 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15]
3101 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm8[4,5,6,7]
3102 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3103 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7]
3104 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11
3105 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm11[1],xmm9[2,3,4,5],xmm11[6],xmm9[7]
3106 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
3107 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
3108 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,6,1,0,5,0,0,0]
3109 ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm12, %ymm11
3110 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27]
3111 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm9, %ymm11, %ymm9
3112 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
3113 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
3114 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3,4,5],xmm11[6],xmm12[7]
3115 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
3116 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
3117 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [2,5,2,5,2,5,2,5]
3118 ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm12
3119 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
3120 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
3121 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm11[1,2,3,4,5,6,7],ymm9[8],ymm11[9,10,11,12,13,14,15]
3122 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7]
3123 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
3124 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
3125 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7]
3126 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
3127 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm12
3128 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm7[0,1,1,3]
3129 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
3130 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7]
3131 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
3132 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14
3133 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3],xmm14[4,5,6,7]
3134 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15]
3135 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
3136 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,3,2,3]
3137 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
3138 ; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm13, %ymm14, %ymm10
3139 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm12[1,2,3,4,5,6,7],ymm10[8],ymm12[9,10,11,12,13,14,15]
3140 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
3141 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
3142 ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm13 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
3143 ; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm12, %xmm14
3144 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
3145 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7]
3146 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
3147 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
3148 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,3,7,2,6,0,0,0]
3149 ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm15, %ymm15
3150 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
3151 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm4
3152 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm12[1],xmm4[2],xmm12[3],xmm4[4,5,6,7]
3153 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
3154 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3155 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27]
3156 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm11[7]
3157 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
3158 ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm11
3159 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],xmm11[2,3]
3160 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1,2,3,4,5,6,7],ymm11[8],ymm4[9,10,11,12,13,14,15]
3161 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm4[4,5,6,7]
3162 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
3163 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [2,5,1,4,2,5,1,4]
3164 ; AVX2-FCP-NEXT: # ymm14 = mem[0,1,0,1]
3165 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm14, %ymm4
3166 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,0,3,7,0]
3167 ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm14, %ymm14
3168 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm15 = [30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
3169 ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm14, %ymm14
3170 ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm4
3171 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm14[5,6,7],ymm4[8,9,10,11,12],ymm14[13,14,15]
3172 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
3173 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,4,7,3,6,0,0,0]
3174 ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm15, %ymm14
3175 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
3176 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm8
3177 ; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm8
3178 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
3179 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
3180 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm14[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
3181 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0],ymm4[1,2,3,4,5,6,7],ymm13[8],ymm4[9,10,11,12,13,14,15]
3182 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm13[2,3]
3183 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0,1,2,3],ymm4[4,5,6,7]
3184 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,7,0,0,4,7,0]
3185 ; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1]
3186 ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm4, %ymm4
3187 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
3188 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,6,1,5,2,6,1,5]
3189 ; AVX2-FCP-NEXT: # ymm6 = mem[0,1,0,1]
3190 ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm5
3191 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
3192 ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4
3193 ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
3194 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7],ymm5[8,9,10,11,12],ymm4[13,14,15]
3195 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7]
3196 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,0,3,7,0,0,0]
3197 ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
3198 ; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
3199 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
3200 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
3201 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
3202 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
3203 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3204 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15]
3205 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
3206 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3207 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3208 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi)
3209 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3210 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx)
3211 ; AVX2-FCP-NEXT: vmovdqa %ymm9, (%rcx)
3212 ; AVX2-FCP-NEXT: vmovdqa %ymm10, (%r8)
3213 ; AVX2-FCP-NEXT: vmovdqa %ymm11, (%r9)
3214 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3215 ; AVX2-FCP-NEXT: vmovdqa %ymm13, (%rax)
3216 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3217 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax)
3218 ; AVX2-FCP-NEXT: vzeroupper
3219 ; AVX2-FCP-NEXT: retq
3221 ; AVX512-LABEL: load_i16_stride7_vf16:
3222 ; AVX512: # %bb.0:
3223 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
3224 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
3225 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm2
3226 ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm3
3227 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
3228 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
3229 ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
3230 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
3231 ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
3232 ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7]
3233 ; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u,u,u,u,u,u,u]
3234 ; AVX512-NEXT: vpor %ymm4, %ymm5, %ymm4
3235 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm5
3236 ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm6
3237 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
3238 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,6,4,7]
3239 ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm7
3240 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
3241 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,7]
3242 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
3243 ; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
3244 ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm9
3245 ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm9[0,1,0,2]
3246 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm8[0,1,2,1,4,5,6,5]
3247 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
3248 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm10[7]
3249 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7]
3250 ; AVX512-NEXT: vmovdqa64 %ymm4, %ymm17
3251 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
3252 ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm10
3253 ; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4],xmm7[5],xmm10[6],xmm7[7]
3254 ; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
3255 ; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
3256 ; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,5,7]
3257 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
3258 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
3259 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm8
3260 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
3261 ; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6,7,8,9,10],ymm8[11],ymm10[12,13,14,15]
3262 ; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3263 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
3264 ; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11
3265 ; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7]
3266 ; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
3267 ; AVX512-NEXT: vpor %ymm8, %ymm10, %ymm8
3268 ; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
3269 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
3270 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
3271 ; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm10
3272 ; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2,3,4,5],xmm8[6],xmm10[7]
3273 ; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
3274 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
3275 ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,1,1,2]
3276 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,0,3,4,5,4,7]
3277 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
3278 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7]
3279 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
3280 ; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11
3281 ; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5,6,7,8,9,10,11],ymm11[12],ymm10[13,14,15]
3282 ; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3283 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7]
3284 ; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm12
3285 ; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7]
3286 ; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
3287 ; AVX512-NEXT: vpor %ymm10, %ymm11, %ymm10
3288 ; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3,4,5,6,7],ymm10[8],ymm8[9,10,11,12,13,14,15]
3289 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
3290 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
3291 ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,2,3]
3292 ; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3293 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
3294 ; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm12
3295 ; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7]
3296 ; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
3297 ; AVX512-NEXT: vpor %ymm10, %ymm11, %ymm10
3298 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
3299 ; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm12
3300 ; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7]
3301 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,1,0,4,5,6,7]
3302 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,6,7]
3303 ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
3304 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,1,3]
3305 ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm12[0,1,2,0,4,5,6,4]
3306 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
3307 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
3308 ; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15]
3309 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
3310 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
3311 ; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11
3312 ; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
3313 ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
3314 ; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
3315 ; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[0,1,2,1,4,5,6,5]
3316 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
3317 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
3318 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
3319 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
3320 ; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
3321 ; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6],ymm11[7,8,9,10,11,12,13],ymm12[14],ymm11[15]
3322 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
3323 ; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm13
3324 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7]
3325 ; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
3326 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7]
3327 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
3328 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7]
3329 ; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15]
3330 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
3331 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
3332 ; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm12
3333 ; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3],xmm11[4],xmm12[5,6,7]
3334 ; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
3335 ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
3336 ; AVX512-NEXT: vmovdqa 208(%rdi), %xmm12
3337 ; AVX512-NEXT: vmovdqa 192(%rdi), %xmm13
3338 ; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,5],xmm12[6],xmm13[7]
3339 ; AVX512-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,0,3]
3340 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,6]
3341 ; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
3342 ; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7],ymm11[8,9,10,11,12],ymm14[13,14,15]
3343 ; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
3344 ; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
3345 ; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7,8],ymm14[9,10,11,12,13,14],ymm15[15]
3346 ; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
3347 ; AVX512-NEXT: vextracti32x4 $1, %ymm15, %xmm16
3348 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm16[2,1,2,3]
3349 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
3350 ; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3]
3351 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7]
3352 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
3353 ; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
3354 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3,4,5,6,7]
3355 ; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm11[1,2,3,4,5,6,7],ymm4[8],ymm11[9,10,11,12,13,14,15]
3356 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm11[4,5,6,7]
3357 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
3358 ; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5
3359 ; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7]
3360 ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7]
3361 ; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u]
3362 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3363 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
3364 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
3365 ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
3366 ; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
3367 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7]
3368 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
3369 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15]
3370 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
3371 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7]
3372 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
3373 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
3374 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
3375 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
3376 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
3377 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
3378 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15]
3379 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3380 ; AVX512-NEXT: vmovdqa64 %ymm17, (%rsi)
3381 ; AVX512-NEXT: vmovdqa %ymm7, (%rdx)
3382 ; AVX512-NEXT: vmovdqa %ymm8, (%rcx)
3383 ; AVX512-NEXT: vmovdqa %ymm9, (%r8)
3384 ; AVX512-NEXT: vmovdqa %ymm10, (%r9)
3385 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
3386 ; AVX512-NEXT: vmovdqa %ymm11, (%rax)
3387 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
3388 ; AVX512-NEXT: vmovdqa %ymm0, (%rax)
3389 ; AVX512-NEXT: vzeroupper
3390 ; AVX512-NEXT: retq
3392 ; AVX512-FCP-LABEL: load_i16_stride7_vf16:
3393 ; AVX512-FCP: # %bb.0:
3394 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0
3395 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,6,9,13,2,6,9,13]
3396 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
3397 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,5,9,12,2,5,9,12]
3398 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [10,3,6,15,12,13,6,15]
3399 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,6,9,0,13,0,0,0]
3400 ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm9
3401 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,5,9,0,12,0,0,0]
3402 ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm7
3403 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15]
3404 ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm3
3405 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,6,10,13,3,6,10,13]
3406 ; AVX512-FCP-NEXT: vpermd %zmm2, %zmm4, %zmm4
3407 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm5
3408 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm5[0,1,0,2]
3409 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
3410 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm14, %ymm6
3411 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u]
3412 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6],ymm6[7]
3413 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
3414 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4
3415 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm6
3416 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7]
3417 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15
3418 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm15[4],xmm13[5],xmm15[6],xmm13[7]
3419 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[u,u,u,u,u,u,u,u,u,u,u,u]
3420 ; AVX512-FCP-NEXT: vpor %ymm3, %ymm13, %ymm3
3421 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm12[5,6,7]
3422 ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm12
3423 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm13
3424 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7]
3425 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm1
3426 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3],xmm1[4],xmm15[5],xmm1[6],xmm15[7]
3427 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
3428 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3429 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
3430 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
3431 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3432 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7]
3433 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
3434 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7]
3435 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
3436 ; AVX512-FCP-NEXT: vpor %ymm7, %ymm14, %ymm7
3437 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1,2,3,4,5,6,7],ymm7[8],ymm1[9,10,11,12,13,14,15]
3438 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm1[4,5,6,7]
3439 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7]
3440 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm14
3441 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3,4,5],xmm1[6],xmm14[7]
3442 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
3443 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3444 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5]
3445 ; AVX512-FCP-NEXT: vpermd %ymm5, %ymm14, %ymm14
3446 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
3447 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
3448 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3449 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7]
3450 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
3451 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7]
3452 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
3453 ; AVX512-FCP-NEXT: vpor %ymm9, %ymm14, %ymm9
3454 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1,2,3,4,5,6,7],ymm9[8],ymm1[9,10,11,12,13,14,15]
3455 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7]
3456 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7]
3457 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm14
3458 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3,4,5],xmm14[6],xmm1[7]
3459 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
3460 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3461 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm5[0,1,1,3]
3462 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
3463 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
3464 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
3465 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm8
3466 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2],xmm14[3],xmm8[4,5,6,7]
3467 ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm10, %zmm10
3468 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3469 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
3470 ; AVX512-FCP-NEXT: vpor %ymm10, %ymm8, %ymm8
3471 ; AVX512-FCP-NEXT: vpermd %zmm2, %zmm17, %zmm14
3472 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,3,7,10,14,0,0,0]
3473 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15]
3474 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7]
3475 ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm10, %zmm1
3476 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7]
3477 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12
3478 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7]
3479 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
3480 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
3481 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm11
3482 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
3483 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
3484 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
3485 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm13
3486 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
3487 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
3488 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
3489 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
3490 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1
3491 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3,4,5,6,7]
3492 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm10[1,2,3,4,5,6,7],ymm1[8],ymm10[9,10,11,12,13,14,15]
3493 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7]
3494 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,3,0,0,3,7,0]
3495 ; AVX512-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1
3496 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
3497 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm1
3498 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm14, %ymm13
3499 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6,7],ymm13[8,9,10,11,12],ymm1[13,14,15]
3500 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
3501 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14
3502 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm12
3503 ; AVX512-FCP-NEXT: vpermd %zmm2, %zmm16, %zmm2
3504 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,4,7,11,14,0,0,0]
3505 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
3506 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
3507 ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm13
3508 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
3509 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7]
3510 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm1[1,2,3,4,5,6,7],ymm12[8],ymm1[9,10,11,12,13,14,15]
3511 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
3512 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0]
3513 ; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1]
3514 ; AVX512-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm5
3515 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
3516 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm5
3517 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
3518 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,4,8,11,15,0,0,0]
3519 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15]
3520 ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm12, %zmm0
3521 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0
3522 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7]
3523 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
3524 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
3525 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
3526 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
3527 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
3528 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
3529 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
3530 ; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rsi)
3531 ; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rdx)
3532 ; AVX512-FCP-NEXT: vmovdqa %ymm9, (%rcx)
3533 ; AVX512-FCP-NEXT: vmovdqa %ymm8, (%r8)
3534 ; AVX512-FCP-NEXT: vmovdqa %ymm10, (%r9)
3535 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3536 ; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rax)
3537 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3538 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax)
3539 ; AVX512-FCP-NEXT: vzeroupper
3540 ; AVX512-FCP-NEXT: retq
3542 ; AVX512DQ-LABEL: load_i16_stride7_vf16:
3543 ; AVX512DQ: # %bb.0:
3544 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
3545 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
3546 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm2
3547 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm3
3548 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
3549 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
3550 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
3551 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
3552 ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6
3553 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7]
3554 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u,u,u,u,u,u,u]
3555 ; AVX512DQ-NEXT: vpor %ymm4, %ymm5, %ymm4
3556 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm5
3557 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm6
3558 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
3559 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,6,4,7]
3560 ; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm7
3561 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
3562 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,7,7]
3563 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
3564 ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
3565 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm9
3566 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm9[0,1,0,2]
3567 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm8[0,1,2,1,4,5,6,5]
3568 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
3569 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm10[7]
3570 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7]
3571 ; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm17
3572 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
3573 ; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm10
3574 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4],xmm7[5],xmm10[6],xmm7[7]
3575 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
3576 ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
3577 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,5,7]
3578 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
3579 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7]
3580 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm8
3581 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
3582 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3],ymm10[4,5,6,7,8,9,10],ymm8[11],ymm10[12,13,14,15]
3583 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3584 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
3585 ; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11
3586 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7]
3587 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
3588 ; AVX512DQ-NEXT: vpor %ymm8, %ymm10, %ymm8
3589 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
3590 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
3591 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
3592 ; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm10
3593 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1],xmm10[2,3,4,5],xmm8[6],xmm10[7]
3594 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
3595 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
3596 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,1,1,2]
3597 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,0,3,4,5,4,7]
3598 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
3599 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7]
3600 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
3601 ; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11
3602 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5,6,7,8,9,10,11],ymm11[12],ymm10[13,14,15]
3603 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3604 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7]
3605 ; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm12
3606 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7]
3607 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
3608 ; AVX512DQ-NEXT: vpor %ymm10, %ymm11, %ymm10
3609 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1,2,3,4,5,6,7],ymm10[8],ymm8[9,10,11,12,13,14,15]
3610 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
3611 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
3612 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,2,3]
3613 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3614 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
3615 ; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm12
3616 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7]
3617 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
3618 ; AVX512DQ-NEXT: vpor %ymm10, %ymm11, %ymm10
3619 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
3620 ; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm12
3621 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7]
3622 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,1,0,4,5,6,7]
3623 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,6,7]
3624 ; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
3625 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,1,3]
3626 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm12[0,1,2,0,4,5,6,4]
3627 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
3628 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
3629 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15]
3630 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
3631 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
3632 ; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11
3633 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
3634 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
3635 ; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
3636 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm12[0,1,2,1,4,5,6,5]
3637 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
3638 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
3639 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
3640 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
3641 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
3642 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6],ymm11[7,8,9,10,11,12,13],ymm12[14],ymm11[15]
3643 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
3644 ; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm13
3645 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7]
3646 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
3647 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7]
3648 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
3649 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7]
3650 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15]
3651 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
3652 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
3653 ; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm12
3654 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3],xmm11[4],xmm12[5,6,7]
3655 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
3656 ; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
3657 ; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm12
3658 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm13
3659 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,5],xmm12[6],xmm13[7]
3660 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,0,3]
3661 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,6]
3662 ; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
3663 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7],ymm11[8,9,10,11,12],ymm14[13,14,15]
3664 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
3665 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
3666 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1,2,3,4,5,6],ymm15[7,8],ymm14[9,10,11,12,13,14],ymm15[15]
3667 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
3668 ; AVX512DQ-NEXT: vextracti32x4 $1, %ymm15, %xmm16
3669 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm16[2,1,2,3]
3670 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
3671 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3]
3672 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7]
3673 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
3674 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
3675 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3,4,5,6,7]
3676 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm11[1,2,3,4,5,6,7],ymm4[8],ymm11[9,10,11,12,13,14,15]
3677 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2,3],ymm11[4,5,6,7]
3678 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
3679 ; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5
3680 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7]
3681 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7]
3682 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u]
3683 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3684 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
3685 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
3686 ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
3687 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
3688 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7]
3689 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
3690 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15]
3691 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
3692 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7]
3693 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
3694 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
3695 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
3696 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
3697 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
3698 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
3699 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm4[1,2,3,4,5,6,7],ymm0[8],ymm4[9,10,11,12,13,14,15]
3700 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3701 ; AVX512DQ-NEXT: vmovdqa64 %ymm17, (%rsi)
3702 ; AVX512DQ-NEXT: vmovdqa %ymm7, (%rdx)
3703 ; AVX512DQ-NEXT: vmovdqa %ymm8, (%rcx)
3704 ; AVX512DQ-NEXT: vmovdqa %ymm9, (%r8)
3705 ; AVX512DQ-NEXT: vmovdqa %ymm10, (%r9)
3706 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
3707 ; AVX512DQ-NEXT: vmovdqa %ymm11, (%rax)
3708 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
3709 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax)
3710 ; AVX512DQ-NEXT: vzeroupper
3711 ; AVX512DQ-NEXT: retq
3713 ; AVX512DQ-FCP-LABEL: load_i16_stride7_vf16:
3714 ; AVX512DQ-FCP: # %bb.0:
3715 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0
3716 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,6,9,13,2,6,9,13]
3717 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
3718 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,5,9,12,2,5,9,12]
3719 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [10,3,6,15,12,13,6,15]
3720 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,6,9,0,13,0,0,0]
3721 ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm9
3722 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,5,9,0,12,0,0,0]
3723 ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm7
3724 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15]
3725 ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm3
3726 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,6,10,13,3,6,10,13]
3727 ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm4, %zmm4
3728 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm5
3729 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm5[0,1,0,2]
3730 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
3731 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm14, %ymm6
3732 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u]
3733 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6],ymm6[7]
3734 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
3735 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4
3736 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm6
3737 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7]
3738 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15
3739 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm15[4],xmm13[5],xmm15[6],xmm13[7]
3740 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[u,u,u,u,u,u,u,u,u,u,u,u]
3741 ; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm13, %ymm3
3742 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm12[5,6,7]
3743 ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm12
3744 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm13
3745 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7]
3746 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm1
3747 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3],xmm1[4],xmm15[5],xmm1[6],xmm15[7]
3748 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
3749 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3750 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
3751 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
3752 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3753 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7]
3754 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
3755 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7]
3756 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
3757 ; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm14, %ymm7
3758 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1,2,3,4,5,6,7],ymm7[8],ymm1[9,10,11,12,13,14,15]
3759 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm1[4,5,6,7]
3760 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7]
3761 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm14
3762 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3,4,5],xmm1[6],xmm14[7]
3763 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
3764 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3765 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5]
3766 ; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm14, %ymm14
3767 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
3768 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
3769 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3770 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7]
3771 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
3772 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7]
3773 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
3774 ; AVX512DQ-FCP-NEXT: vpor %ymm9, %ymm14, %ymm9
3775 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1,2,3,4,5,6,7],ymm9[8],ymm1[9,10,11,12,13,14,15]
3776 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7]
3777 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7]
3778 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm14
3779 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3,4,5],xmm14[6],xmm1[7]
3780 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
3781 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
3782 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm5[0,1,1,3]
3783 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
3784 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
3785 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
3786 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm8
3787 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2],xmm14[3],xmm8[4,5,6,7]
3788 ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm10, %zmm10
3789 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
3790 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
3791 ; AVX512DQ-FCP-NEXT: vpor %ymm10, %ymm8, %ymm8
3792 ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm17, %zmm14
3793 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,3,7,10,14,0,0,0]
3794 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15]
3795 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7]
3796 ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm10, %zmm1
3797 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7]
3798 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12
3799 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7]
3800 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
3801 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
3802 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm11
3803 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
3804 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
3805 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
3806 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm13
3807 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
3808 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
3809 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
3810 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
3811 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1
3812 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3,4,5,6,7]
3813 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm10[1,2,3,4,5,6,7],ymm1[8],ymm10[9,10,11,12,13,14,15]
3814 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7]
3815 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,3,0,0,3,7,0]
3816 ; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1
3817 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
3818 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm1
3819 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm14, %ymm13
3820 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6,7],ymm13[8,9,10,11,12],ymm1[13,14,15]
3821 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
3822 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14
3823 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm12
3824 ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm16, %zmm2
3825 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,4,7,11,14,0,0,0]
3826 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
3827 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
3828 ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm13
3829 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
3830 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7]
3831 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm1[1,2,3,4,5,6,7],ymm12[8],ymm1[9,10,11,12,13,14,15]
3832 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
3833 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0]
3834 ; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,0,1]
3835 ; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm5
3836 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
3837 ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm5
3838 ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
3839 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,4,8,11,15,0,0,0]
3840 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15]
3841 ; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm12, %zmm0
3842 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0
3843 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7]
3844 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
3845 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
3846 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
3847 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
3848 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
3849 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
3850 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
3851 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rsi)
3852 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rdx)
3853 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, (%rcx)
3854 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%r8)
3855 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, (%r9)
3856 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3857 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rax)
3858 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3859 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax)
3860 ; AVX512DQ-FCP-NEXT: vzeroupper
3861 ; AVX512DQ-FCP-NEXT: retq
3863 ; AVX512BW-LABEL: load_i16_stride7_vf16:
3864 ; AVX512BW: # %bb.0:
3865 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
3866 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
3867 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3868 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
3869 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
3870 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
3871 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,13,20,27,34,41,0,0,6,13,20,27,34,41]
3872 ; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1]
3873 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4
3874 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0]
3875 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
3876 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
3877 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,7,14,21,28,35,42,0,0,7,14,21,28,35,42]
3878 ; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1]
3879 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
3880 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0]
3881 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
3882 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15]
3883 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
3884 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,8,15,22,29,36,43,0,1,8,15,22,29,36,43]
3885 ; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1]
3886 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3887 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0]
3888 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
3889 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15]
3890 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
3891 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,2,9,16,23,30,37,44,0,2,9,16,23,30,37,44]
3892 ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1]
3893 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
3894 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0]
3895 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
3896 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
3897 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
3898 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,3,10,17,24,31,38,45,0,3,10,17,24,31,38,45]
3899 ; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1]
3900 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
3901 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0]
3902 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm9
3903 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15]
3904 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
3905 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,11,18,25,32,39,46,0,4,11,18,25,32,39,46]
3906 ; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1]
3907 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9
3908 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0]
3909 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm10
3910 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15]
3911 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
3912 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,5,12,19,26,33,40,47,0,5,12,19,26,33,40,47]
3913 ; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1]
3914 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10
3915 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0]
3916 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
3917 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1,2,3,4,5,6,7],ymm2[8],ymm10[9,10,11,12,13,14,15]
3918 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3919 ; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi)
3920 ; AVX512BW-NEXT: vmovdqa %ymm5, (%rdx)
3921 ; AVX512BW-NEXT: vmovdqa %ymm6, (%rcx)
3922 ; AVX512BW-NEXT: vmovdqa %ymm7, (%r8)
3923 ; AVX512BW-NEXT: vmovdqa %ymm8, (%r9)
3924 ; AVX512BW-NEXT: vmovdqa %ymm9, (%r10)
3925 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rax)
3926 ; AVX512BW-NEXT: vzeroupper
3927 ; AVX512BW-NEXT: retq
3929 ; AVX512BW-FCP-LABEL: load_i16_stride7_vf16:
3930 ; AVX512BW-FCP: # %bb.0:
3931 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3932 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
3933 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
3934 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
3935 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
3936 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
3937 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,13,20,27,34,41,0,0,6,13,20,27,34,41]
3938 ; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1]
3939 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4
3940 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0]
3941 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
3942 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
3943 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,7,14,21,28,35,42,0,0,7,14,21,28,35,42]
3944 ; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1]
3945 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
3946 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0]
3947 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
3948 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15]
3949 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
3950 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,8,15,22,29,36,43,0,1,8,15,22,29,36,43]
3951 ; AVX512BW-FCP-NEXT: # ymm6 = mem[0,1,0,1]
3952 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3953 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0]
3954 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
3955 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15]
3956 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
3957 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,2,9,16,23,30,37,44,0,2,9,16,23,30,37,44]
3958 ; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1]
3959 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
3960 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0]
3961 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
3962 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
3963 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
3964 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,3,10,17,24,31,38,45,0,3,10,17,24,31,38,45]
3965 ; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1]
3966 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
3967 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0]
3968 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm9
3969 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15]
3970 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
3971 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,11,18,25,32,39,46,0,4,11,18,25,32,39,46]
3972 ; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,0,1]
3973 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9
3974 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0]
3975 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm10
3976 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15]
3977 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
3978 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,5,12,19,26,33,40,47,0,5,12,19,26,33,40,47]
3979 ; AVX512BW-FCP-NEXT: # ymm10 = mem[0,1,0,1]
3980 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10
3981 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0]
3982 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
3983 ; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1,2,3,4,5,6,7],ymm2[8],ymm10[9,10,11,12,13,14,15]
3984 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3985 ; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rsi)
3986 ; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rdx)
3987 ; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rcx)
3988 ; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r8)
3989 ; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r9)
3990 ; AVX512BW-FCP-NEXT: vmovdqa %ymm9, (%r10)
3991 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
3992 ; AVX512BW-FCP-NEXT: vzeroupper
3993 ; AVX512BW-FCP-NEXT: retq
3995 ; AVX512DQ-BW-LABEL: load_i16_stride7_vf16:
3996 ; AVX512DQ-BW: # %bb.0:
3997 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
3998 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
3999 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
4000 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
4001 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2
4002 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3
4003 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,13,20,27,34,41,0,0,6,13,20,27,34,41]
4004 ; AVX512DQ-BW-NEXT: # ymm4 = mem[0,1,0,1]
4005 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4
4006 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0]
4007 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
4008 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
4009 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,7,14,21,28,35,42,0,0,7,14,21,28,35,42]
4010 ; AVX512DQ-BW-NEXT: # ymm5 = mem[0,1,0,1]
4011 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
4012 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0]
4013 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
4014 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15]
4015 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
4016 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,8,15,22,29,36,43,0,1,8,15,22,29,36,43]
4017 ; AVX512DQ-BW-NEXT: # ymm6 = mem[0,1,0,1]
4018 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
4019 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0]
4020 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
4021 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15]
4022 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
4023 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,2,9,16,23,30,37,44,0,2,9,16,23,30,37,44]
4024 ; AVX512DQ-BW-NEXT: # ymm7 = mem[0,1,0,1]
4025 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
4026 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0]
4027 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
4028 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
4029 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
4030 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,3,10,17,24,31,38,45,0,3,10,17,24,31,38,45]
4031 ; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1]
4032 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
4033 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0]
4034 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm9
4035 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15]
4036 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
4037 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,11,18,25,32,39,46,0,4,11,18,25,32,39,46]
4038 ; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1]
4039 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9
4040 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0]
4041 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm10
4042 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15]
4043 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
4044 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,5,12,19,26,33,40,47,0,5,12,19,26,33,40,47]
4045 ; AVX512DQ-BW-NEXT: # ymm10 = mem[0,1,0,1]
4046 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10
4047 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0]
4048 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
4049 ; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1,2,3,4,5,6,7],ymm2[8],ymm10[9,10,11,12,13,14,15]
4050 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
4051 ; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rsi)
4052 ; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%rdx)
4053 ; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%rcx)
4054 ; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%r8)
4055 ; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%r9)
4056 ; AVX512DQ-BW-NEXT: vmovdqa %ymm9, (%r10)
4057 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax)
4058 ; AVX512DQ-BW-NEXT: vzeroupper
4059 ; AVX512DQ-BW-NEXT: retq
4061 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride7_vf16:
4062 ; AVX512DQ-BW-FCP: # %bb.0:
4063 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4064 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
4065 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
4066 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
4067 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
4068 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
4069 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,13,20,27,34,41,0,0,6,13,20,27,34,41]
4070 ; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1]
4071 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4
4072 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0]
4073 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
4074 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
4075 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,7,14,21,28,35,42,0,0,7,14,21,28,35,42]
4076 ; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1]
4077 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
4078 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0]
4079 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
4080 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15]
4081 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
4082 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,8,15,22,29,36,43,0,1,8,15,22,29,36,43]
4083 ; AVX512DQ-BW-FCP-NEXT: # ymm6 = mem[0,1,0,1]
4084 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
4085 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0]
4086 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
4087 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15]
4088 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
4089 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,2,9,16,23,30,37,44,0,2,9,16,23,30,37,44]
4090 ; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1]
4091 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
4092 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0]
4093 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
4094 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
4095 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
4096 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,3,10,17,24,31,38,45,0,3,10,17,24,31,38,45]
4097 ; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1]
4098 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
4099 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0]
4100 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm9
4101 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15]
4102 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
4103 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,11,18,25,32,39,46,0,4,11,18,25,32,39,46]
4104 ; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,0,1]
4105 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9
4106 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0]
4107 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm10
4108 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15]
4109 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
4110 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,5,12,19,26,33,40,47,0,5,12,19,26,33,40,47]
4111 ; AVX512DQ-BW-FCP-NEXT: # ymm10 = mem[0,1,0,1]
4112 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10
4113 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0]
4114 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
4115 ; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1,2,3,4,5,6,7],ymm2[8],ymm10[9,10,11,12,13,14,15]
4116 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
4117 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rsi)
4118 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rdx)
4119 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rcx)
4120 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r8)
4121 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r9)
4122 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm9, (%r10)
4123 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
4124 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
4125 ; AVX512DQ-BW-FCP-NEXT: retq
4126 %wide.vec = load <112 x i16>, ptr %in.vec, align 64
4127 %strided.vec0 = shufflevector <112 x i16> %wide.vec, <112 x i16> poison, <16 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105>
4128 %strided.vec1 = shufflevector <112 x i16> %wide.vec, <112 x i16> poison, <16 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106>
4129 %strided.vec2 = shufflevector <112 x i16> %wide.vec, <112 x i16> poison, <16 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107>
4130 %strided.vec3 = shufflevector <112 x i16> %wide.vec, <112 x i16> poison, <16 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108>
4131 %strided.vec4 = shufflevector <112 x i16> %wide.vec, <112 x i16> poison, <16 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109>
4132 %strided.vec5 = shufflevector <112 x i16> %wide.vec, <112 x i16> poison, <16 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110>
4133 %strided.vec6 = shufflevector <112 x i16> %wide.vec, <112 x i16> poison, <16 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111>
4134 store <16 x i16> %strided.vec0, ptr %out.vec0, align 64
4135 store <16 x i16> %strided.vec1, ptr %out.vec1, align 64
4136 store <16 x i16> %strided.vec2, ptr %out.vec2, align 64
4137 store <16 x i16> %strided.vec3, ptr %out.vec3, align 64
4138 store <16 x i16> %strided.vec4, ptr %out.vec4, align 64
4139 store <16 x i16> %strided.vec5, ptr %out.vec5, align 64
4140 store <16 x i16> %strided.vec6, ptr %out.vec6, align 64
4141 ret void
4142 }
4144 define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
4145 ; SSE-LABEL: load_i16_stride7_vf32:
4146 ; SSE: # %bb.0:
4147 ; SSE-NEXT: subq $600, %rsp # imm = 0x258
4148 ; SSE-NEXT: movdqa 304(%rdi), %xmm5
4149 ; SSE-NEXT: movdqa 288(%rdi), %xmm6
4150 ; SSE-NEXT: movdqa 112(%rdi), %xmm13
4151 ; SSE-NEXT: movdqa 128(%rdi), %xmm8
4152 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4153 ; SSE-NEXT: movaps 160(%rdi), %xmm7
4154 ; SSE-NEXT: movaps 144(%rdi), %xmm10
4155 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4156 ; SSE-NEXT: movdqa 192(%rdi), %xmm9
4157 ; SSE-NEXT: movdqa 176(%rdi), %xmm12
4158 ; SSE-NEXT: movdqa 208(%rdi), %xmm1
4159 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
4160 ; SSE-NEXT: movdqa %xmm1, %xmm11
4161 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4162 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,65535,0]
4163 ; SSE-NEXT: movdqa %xmm2, %xmm1
4164 ; SSE-NEXT: pandn %xmm0, %xmm1
4165 ; SSE-NEXT: movdqa %xmm12, %xmm0
4166 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4167 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
4168 ; SSE-NEXT: pand %xmm2, %xmm0
4169 ; SSE-NEXT: por %xmm1, %xmm0
4170 ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,0,0,0]
4171 ; SSE-NEXT: movdqa %xmm15, %xmm1
4172 ; SSE-NEXT: pandn %xmm0, %xmm1
4173 ; SSE-NEXT: movaps %xmm10, %xmm0
4174 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm7[2,2]
4175 ; SSE-NEXT: movaps %xmm7, %xmm10
4176 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4177 ; SSE-NEXT: movaps {{.*#+}} xmm14 = [65535,65535,65535,0,0,65535,65535,65535]
4178 ; SSE-NEXT: movaps %xmm14, %xmm3
4179 ; SSE-NEXT: andnps %xmm0, %xmm3
4180 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3]
4181 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,0,3]
4182 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4183 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
4184 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
4185 ; SSE-NEXT: movdqa 320(%rdi), %xmm0
4186 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4187 ; SSE-NEXT: pand %xmm14, %xmm4
4188 ; SSE-NEXT: por %xmm3, %xmm4
4189 ; SSE-NEXT: pand %xmm15, %xmm4
4190 ; SSE-NEXT: por %xmm1, %xmm4
4191 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4192 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
4193 ; SSE-NEXT: movdqa %xmm2, %xmm1
4194 ; SSE-NEXT: pandn %xmm0, %xmm1
4195 ; SSE-NEXT: movdqa %xmm6, %xmm0
4196 ; SSE-NEXT: movdqa %xmm6, %xmm7
4197 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4198 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
4199 ; SSE-NEXT: movdqa %xmm5, %xmm6
4200 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4201 ; SSE-NEXT: pand %xmm2, %xmm0
4202 ; SSE-NEXT: por %xmm1, %xmm0
4203 ; SSE-NEXT: movdqa %xmm15, %xmm1
4204 ; SSE-NEXT: pandn %xmm0, %xmm1
4205 ; SSE-NEXT: movaps 272(%rdi), %xmm3
4206 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4207 ; SSE-NEXT: movaps 256(%rdi), %xmm0
4208 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4209 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2]
4210 ; SSE-NEXT: movaps %xmm14, %xmm3
4211 ; SSE-NEXT: andnps %xmm0, %xmm3
4212 ; SSE-NEXT: movdqa 224(%rdi), %xmm0
4213 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4214 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
4215 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
4216 ; SSE-NEXT: movdqa 240(%rdi), %xmm0
4217 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4218 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
4219 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
4220 ; SSE-NEXT: pand %xmm14, %xmm4
4221 ; SSE-NEXT: por %xmm3, %xmm4
4222 ; SSE-NEXT: pand %xmm15, %xmm4
4223 ; SSE-NEXT: por %xmm1, %xmm4
4224 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4225 ; SSE-NEXT: movdqa 432(%rdi), %xmm0
4226 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4227 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
4228 ; SSE-NEXT: movdqa %xmm2, %xmm1
4229 ; SSE-NEXT: pandn %xmm0, %xmm1
4230 ; SSE-NEXT: movdqa 416(%rdi), %xmm3
4231 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4232 ; SSE-NEXT: movdqa 400(%rdi), %xmm0
4233 ; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
4234 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
4235 ; SSE-NEXT: pand %xmm2, %xmm0
4236 ; SSE-NEXT: por %xmm1, %xmm0
4237 ; SSE-NEXT: movdqa %xmm15, %xmm1
4238 ; SSE-NEXT: pandn %xmm0, %xmm1
4239 ; SSE-NEXT: movaps 384(%rdi), %xmm3
4240 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4241 ; SSE-NEXT: movaps 368(%rdi), %xmm0
4242 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4243 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm3[2,2]
4244 ; SSE-NEXT: movaps %xmm14, %xmm3
4245 ; SSE-NEXT: andnps %xmm0, %xmm3
4246 ; SSE-NEXT: movdqa 336(%rdi), %xmm0
4247 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4248 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
4249 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
4250 ; SSE-NEXT: movdqa 352(%rdi), %xmm8
4251 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,3,3]
4252 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4253 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
4254 ; SSE-NEXT: pand %xmm14, %xmm4
4255 ; SSE-NEXT: por %xmm3, %xmm4
4256 ; SSE-NEXT: pand %xmm15, %xmm4
4257 ; SSE-NEXT: por %xmm1, %xmm4
4258 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4259 ; SSE-NEXT: movdqa 96(%rdi), %xmm0
4260 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4261 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
4262 ; SSE-NEXT: movdqa %xmm2, %xmm1
4263 ; SSE-NEXT: pandn %xmm0, %xmm1
4264 ; SSE-NEXT: movdqa 80(%rdi), %xmm4
4265 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4266 ; SSE-NEXT: movdqa 64(%rdi), %xmm0
4267 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4268 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
4269 ; SSE-NEXT: pand %xmm2, %xmm0
4270 ; SSE-NEXT: por %xmm1, %xmm0
4271 ; SSE-NEXT: movdqa %xmm15, %xmm1
4272 ; SSE-NEXT: pandn %xmm0, %xmm1
4273 ; SSE-NEXT: movaps 32(%rdi), %xmm0
4274 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4275 ; SSE-NEXT: movaps 48(%rdi), %xmm4
4276 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4277 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm4[2,2]
4278 ; SSE-NEXT: movaps %xmm14, %xmm3
4279 ; SSE-NEXT: andnps %xmm0, %xmm3
4280 ; SSE-NEXT: movdqa (%rdi), %xmm0
4281 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4282 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
4283 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
4284 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
4285 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4286 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
4287 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
4288 ; SSE-NEXT: pand %xmm14, %xmm4
4289 ; SSE-NEXT: por %xmm3, %xmm4
4290 ; SSE-NEXT: pand %xmm15, %xmm4
4291 ; SSE-NEXT: por %xmm1, %xmm4
4292 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4293 ; SSE-NEXT: movdqa %xmm11, %xmm0
4294 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
4295 ; SSE-NEXT: movdqa %xmm2, %xmm1
4296 ; SSE-NEXT: pandn %xmm0, %xmm1
4297 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4298 ; SSE-NEXT: movdqa %xmm9, %xmm0
4299 ; SSE-NEXT: psrld $16, %xmm0
4300 ; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
4301 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
4302 ; SSE-NEXT: pand %xmm2, %xmm12
4303 ; SSE-NEXT: por %xmm1, %xmm12
4304 ; SSE-NEXT: movdqa %xmm15, %xmm0
4305 ; SSE-NEXT: pandn %xmm12, %xmm0
4306 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,65535,65535,65535,65535]
4307 ; SSE-NEXT: movdqa %xmm11, %xmm1
4308 ; SSE-NEXT: pandn %xmm13, %xmm1
4309 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4310 ; SSE-NEXT: movdqa %xmm13, %xmm4
4311 ; SSE-NEXT: pand %xmm11, %xmm4
4312 ; SSE-NEXT: por %xmm1, %xmm4
4313 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4314 ; SSE-NEXT: movdqa %xmm12, %xmm1
4315 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
4316 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1]
4317 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
4318 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
4319 ; SSE-NEXT: movdqa %xmm14, %xmm3
4320 ; SSE-NEXT: pandn %xmm1, %xmm3
4321 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3]
4322 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
4323 ; SSE-NEXT: pand %xmm14, %xmm1
4324 ; SSE-NEXT: por %xmm1, %xmm3
4325 ; SSE-NEXT: pand %xmm15, %xmm3
4326 ; SSE-NEXT: por %xmm0, %xmm3
4327 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4328 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4329 ; SSE-NEXT: movdqa %xmm5, %xmm0
4330 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
4331 ; SSE-NEXT: movdqa %xmm2, %xmm1
4332 ; SSE-NEXT: pandn %xmm0, %xmm1
4333 ; SSE-NEXT: psrld $16, %xmm6
4334 ; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
4335 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
4336 ; SSE-NEXT: pand %xmm2, %xmm7
4337 ; SSE-NEXT: por %xmm1, %xmm7
4338 ; SSE-NEXT: movdqa %xmm15, %xmm0
4339 ; SSE-NEXT: pandn %xmm7, %xmm0
4340 ; SSE-NEXT: movdqa %xmm11, %xmm1
4341 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4342 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4343 ; SSE-NEXT: pand %xmm11, %xmm4
4344 ; SSE-NEXT: por %xmm1, %xmm4
4345 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4346 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4347 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4348 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1]
4349 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
4350 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
4351 ; SSE-NEXT: movdqa %xmm14, %xmm3
4352 ; SSE-NEXT: pandn %xmm1, %xmm3
4353 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3]
4354 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
4355 ; SSE-NEXT: pand %xmm14, %xmm1
4356 ; SSE-NEXT: por %xmm1, %xmm3
4357 ; SSE-NEXT: pand %xmm15, %xmm3
4358 ; SSE-NEXT: por %xmm0, %xmm3
4359 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4360 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4361 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
4362 ; SSE-NEXT: movdqa %xmm2, %xmm1
4363 ; SSE-NEXT: pandn %xmm0, %xmm1
4364 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4365 ; SSE-NEXT: psrld $16, %xmm0
4366 ; SSE-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload
4367 ; SSE-NEXT: movdqa %xmm7, %xmm4
4368 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
4369 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
4370 ; SSE-NEXT: pand %xmm2, %xmm4
4371 ; SSE-NEXT: por %xmm1, %xmm4
4372 ; SSE-NEXT: movdqa %xmm15, %xmm0
4373 ; SSE-NEXT: pandn %xmm4, %xmm0
4374 ; SSE-NEXT: movdqa %xmm11, %xmm1
4375 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4376 ; SSE-NEXT: movdqa %xmm8, %xmm4
4377 ; SSE-NEXT: pand %xmm11, %xmm4
4378 ; SSE-NEXT: por %xmm1, %xmm4
4379 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4380 ; SSE-NEXT: movdqa %xmm3, %xmm1
4381 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4382 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4383 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,1]
4384 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
4385 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
4386 ; SSE-NEXT: movdqa %xmm14, %xmm8
4387 ; SSE-NEXT: pandn %xmm1, %xmm8
4388 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3]
4389 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
4390 ; SSE-NEXT: pand %xmm14, %xmm1
4391 ; SSE-NEXT: por %xmm1, %xmm8
4392 ; SSE-NEXT: pand %xmm15, %xmm8
4393 ; SSE-NEXT: por %xmm0, %xmm8
4394 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4395 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4396 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
4397 ; SSE-NEXT: movdqa %xmm2, %xmm1
4398 ; SSE-NEXT: pandn %xmm0, %xmm1
4399 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4400 ; SSE-NEXT: psrld $16, %xmm0
4401 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4402 ; SSE-NEXT: movdqa %xmm10, %xmm4
4403 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
4404 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
4405 ; SSE-NEXT: pand %xmm2, %xmm4
4406 ; SSE-NEXT: por %xmm1, %xmm4
4407 ; SSE-NEXT: movdqa %xmm15, %xmm0
4408 ; SSE-NEXT: pandn %xmm4, %xmm0
4409 ; SSE-NEXT: movdqa %xmm11, %xmm1
4410 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4411 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4412 ; SSE-NEXT: pand %xmm11, %xmm4
4413 ; SSE-NEXT: por %xmm1, %xmm4
4414 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,3,2,3]
4415 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7]
4416 ; SSE-NEXT: pand %xmm14, %xmm1
4417 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4418 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4419 ; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
4420 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1]
4421 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
4422 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7]
4423 ; SSE-NEXT: pandn %xmm4, %xmm14
4424 ; SSE-NEXT: por %xmm1, %xmm14
4425 ; SSE-NEXT: pand %xmm15, %xmm14
4426 ; SSE-NEXT: por %xmm0, %xmm14
4427 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4428 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4429 ; SSE-NEXT: # xmm0 = mem[0,1,0,1]
4430 ; SSE-NEXT: movdqa %xmm2, %xmm1
4431 ; SSE-NEXT: pandn %xmm0, %xmm1
4432 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4433 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3]
4434 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
4435 ; SSE-NEXT: pand %xmm2, %xmm0
4436 ; SSE-NEXT: por %xmm1, %xmm0
4437 ; SSE-NEXT: movdqa %xmm15, %xmm1
4438 ; SSE-NEXT: pandn %xmm0, %xmm1
4439 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,1,0,3]
4440 ; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,4,7]
4441 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4442 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,3,3]
4443 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm0[1]
4444 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4445 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3]
4446 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,1,2,3]
4447 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
4448 ; SSE-NEXT: movss {{.*#+}} xmm8 = xmm4[0],xmm8[1,2,3]
4449 ; SSE-NEXT: andps %xmm15, %xmm8
4450 ; SSE-NEXT: orps %xmm1, %xmm8
4451 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4452 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1]
4453 ; SSE-NEXT: movdqa %xmm2, %xmm4
4454 ; SSE-NEXT: pandn %xmm1, %xmm4
4455 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4456 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4457 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
4458 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
4459 ; SSE-NEXT: pand %xmm2, %xmm1
4460 ; SSE-NEXT: por %xmm4, %xmm1
4461 ; SSE-NEXT: movdqa %xmm15, %xmm4
4462 ; SSE-NEXT: pandn %xmm1, %xmm4
4463 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4464 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,1,0,3]
4465 ; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,4,5,4,7]
4466 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4467 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,3,3]
4468 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm14 = xmm14[1],xmm1[1]
4469 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4470 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4471 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
4472 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[2,1,2,3]
4473 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
4474 ; SSE-NEXT: movss {{.*#+}} xmm14 = xmm5[0],xmm14[1,2,3]
4475 ; SSE-NEXT: andps %xmm15, %xmm14
4476 ; SSE-NEXT: orps %xmm4, %xmm14
4477 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4478 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4479 ; SSE-NEXT: # xmm4 = mem[0,1,0,1]
4480 ; SSE-NEXT: movdqa %xmm2, %xmm5
4481 ; SSE-NEXT: pandn %xmm4, %xmm5
4482 ; SSE-NEXT: movdqa %xmm7, %xmm4
4483 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4484 ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
4485 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
4486 ; SSE-NEXT: pand %xmm2, %xmm4
4487 ; SSE-NEXT: por %xmm5, %xmm4
4488 ; SSE-NEXT: movdqa %xmm15, %xmm7
4489 ; SSE-NEXT: pandn %xmm4, %xmm7
4490 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,0,3]
4491 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5,4,7]
4492 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4493 ; SSE-NEXT: # xmm4 = mem[2,2,3,3]
4494 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1]
4495 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4496 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4497 ; SSE-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
4498 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,1,2,3]
4499 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
4500 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3]
4501 ; SSE-NEXT: andps %xmm15, %xmm3
4502 ; SSE-NEXT: orps %xmm7, %xmm3
4503 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4504 ; SSE-NEXT: movdqa %xmm10, %xmm4
4505 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4506 ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
4507 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
4508 ; SSE-NEXT: pand %xmm2, %xmm4
4509 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
4510 ; SSE-NEXT: # xmm7 = mem[0,1,0,1]
4511 ; SSE-NEXT: pandn %xmm7, %xmm2
4512 ; SSE-NEXT: por %xmm4, %xmm2
4513 ; SSE-NEXT: movdqa %xmm15, %xmm4
4514 ; SSE-NEXT: pandn %xmm2, %xmm4
4515 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4516 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,3]
4517 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7]
4518 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4519 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[2,2,3,3]
4520 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1]
4521 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4522 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4523 ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
4524 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,1,2,3]
4525 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
4526 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm7[0],xmm0[1,2,3]
4527 ; SSE-NEXT: andps %xmm15, %xmm0
4528 ; SSE-NEXT: orps %xmm4, %xmm0
4529 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4530 ; SSE-NEXT: movdqa %xmm11, %xmm4
4531 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4532 ; SSE-NEXT: movdqa %xmm9, %xmm7
4533 ; SSE-NEXT: pand %xmm11, %xmm7
4534 ; SSE-NEXT: por %xmm4, %xmm7
4535 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm7[0,1,1,0,4,5,6,7]
4536 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7]
4537 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,6,5,6,7]
4538 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
4539 ; SSE-NEXT: movss {{.*#+}} xmm4 = xmm0[0],xmm4[1,2,3]
4540 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4541 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4542 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
4543 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
4544 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,7,7]
4545 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4546 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
4547 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
4548 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
4549 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
4550 ; SSE-NEXT: movdqa %xmm15, %xmm9
4551 ; SSE-NEXT: pandn %xmm0, %xmm9
4552 ; SSE-NEXT: andps %xmm15, %xmm4
4553 ; SSE-NEXT: por %xmm4, %xmm9
4554 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4555 ; SSE-NEXT: movdqa %xmm11, %xmm0
4556 ; SSE-NEXT: pandn %xmm8, %xmm0
4557 ; SSE-NEXT: movdqa %xmm12, %xmm4
4558 ; SSE-NEXT: pand %xmm11, %xmm4
4559 ; SSE-NEXT: por %xmm0, %xmm4
4560 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,0,4,5,6,7]
4561 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
4562 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
4563 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
4564 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
4565 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4566 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7]
4567 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
4568 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7]
4569 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4570 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
4571 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
4572 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4573 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
4574 ; SSE-NEXT: movdqa %xmm15, %xmm4
4575 ; SSE-NEXT: pandn %xmm1, %xmm4
4576 ; SSE-NEXT: andps %xmm15, %xmm0
4577 ; SSE-NEXT: por %xmm0, %xmm4
4578 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4579 ; SSE-NEXT: movdqa %xmm11, %xmm0
4580 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4581 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4582 ; SSE-NEXT: movdqa %xmm12, %xmm1
4583 ; SSE-NEXT: pand %xmm11, %xmm1
4584 ; SSE-NEXT: por %xmm0, %xmm1
4585 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7]
4586 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
4587 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,5,6,7]
4588 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
4589 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
4590 ; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
4591 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4592 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4593 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
4594 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7]
4595 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4596 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4597 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
4598 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4599 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
4600 ; SSE-NEXT: movdqa %xmm15, %xmm4
4601 ; SSE-NEXT: pandn %xmm1, %xmm4
4602 ; SSE-NEXT: andps %xmm15, %xmm0
4603 ; SSE-NEXT: por %xmm0, %xmm4
4604 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4605 ; SSE-NEXT: movdqa %xmm11, %xmm0
4606 ; SSE-NEXT: pandn %xmm3, %xmm0
4607 ; SSE-NEXT: movdqa %xmm14, %xmm1
4608 ; SSE-NEXT: pand %xmm11, %xmm1
4609 ; SSE-NEXT: por %xmm0, %xmm1
4610 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,0,4,5,6,7]
4611 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
4612 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,7]
4613 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
4614 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
4615 ; SSE-NEXT: movdqa %xmm10, %xmm2
4616 ; SSE-NEXT: movdqa %xmm10, %xmm1
4617 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4618 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4619 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
4620 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7]
4621 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4622 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
4623 ; SSE-NEXT: andps %xmm15, %xmm0
4624 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
4625 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
4626 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
4627 ; SSE-NEXT: pandn %xmm1, %xmm15
4628 ; SSE-NEXT: por %xmm0, %xmm15
4629 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4630 ; SSE-NEXT: movdqa %xmm9, %xmm0
4631 ; SSE-NEXT: psrld $16, %xmm0
4632 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4633 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
4634 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4635 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3]
4636 ; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
4637 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4638 ; SSE-NEXT: psrlq $48, %xmm1
4639 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4640 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4641 ; SSE-NEXT: movdqa %xmm4, %xmm1
4642 ; SSE-NEXT: psrlq $16, %xmm1
4643 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4644 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
4645 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
4646 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4647 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4648 ; SSE-NEXT: # xmm1 = mem[0,1,0,3]
4649 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
4650 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
4651 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
4652 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
4653 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
4654 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
4655 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
4656 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
4657 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4658 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4659 ; SSE-NEXT: movdqa %xmm1, %xmm0
4660 ; SSE-NEXT: psrld $16, %xmm0
4661 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4662 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
4663 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4664 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
4665 ; SSE-NEXT: movdqa %xmm1, %xmm6
4666 ; SSE-NEXT: movdqa %xmm7, %xmm1
4667 ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
4668 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4669 ; SSE-NEXT: psrlq $48, %xmm1
4670 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4671 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4672 ; SSE-NEXT: movdqa %xmm14, %xmm1
4673 ; SSE-NEXT: psrlq $16, %xmm1
4674 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4675 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3]
4676 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
4677 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4678 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4679 ; SSE-NEXT: # xmm1 = mem[0,1,0,3]
4680 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
4681 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
4682 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7]
4683 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
4684 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
4685 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
4686 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
4687 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
4688 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4689 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4690 ; SSE-NEXT: movdqa %xmm1, %xmm7
4691 ; SSE-NEXT: psrld $16, %xmm7
4692 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4693 ; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
4694 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
4695 ; SSE-NEXT: movdqa %xmm1, %xmm6
4696 ; SSE-NEXT: movdqa %xmm8, %xmm1
4697 ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
4698 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4699 ; SSE-NEXT: psrlq $48, %xmm1
4700 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4701 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4702 ; SSE-NEXT: movdqa %xmm13, %xmm1
4703 ; SSE-NEXT: psrlq $16, %xmm1
4704 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3]
4705 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
4706 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4707 ; SSE-NEXT: pshufd $196, (%rsp), %xmm1 # 16-byte Folded Reload
4708 ; SSE-NEXT: # xmm1 = mem[0,1,0,3]
4709 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
4710 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4711 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm8[2],xmm1[3],xmm8[3]
4712 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4713 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4714 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
4715 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
4716 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
4717 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
4718 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
4719 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4720 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4721 ; SSE-NEXT: movdqa %xmm1, %xmm6
4722 ; SSE-NEXT: psrld $16, %xmm6
4723 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4724 ; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
4725 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
4726 ; SSE-NEXT: movdqa %xmm1, %xmm12
4727 ; SSE-NEXT: movdqa %xmm9, %xmm1
4728 ; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
4729 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4730 ; SSE-NEXT: psrlq $48, %xmm1
4731 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4732 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4733 ; SSE-NEXT: psrlq $16, %xmm1
4734 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4735 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
4736 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
4737 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4738 ; SSE-NEXT: movdqa %xmm2, %xmm12
4739 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
4740 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
4741 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4742 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
4743 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
4744 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,0]
4745 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
4746 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
4747 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7]
4748 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
4749 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4750 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
4751 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,6,5,6,7]
4752 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
4753 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4754 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
4755 ; SSE-NEXT: movdqa %xmm11, %xmm0
4756 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4757 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4758 ; SSE-NEXT: pand %xmm11, %xmm1
4759 ; SSE-NEXT: por %xmm0, %xmm1
4760 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
4761 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7]
4762 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4763 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4764 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
4765 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2]
4766 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4767 ; SSE-NEXT: movdqa %xmm14, %xmm0
4768 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
4769 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
4770 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
4771 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4772 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
4773 ; SSE-NEXT: movdqa %xmm11, %xmm0
4774 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4775 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4776 ; SSE-NEXT: pand %xmm11, %xmm1
4777 ; SSE-NEXT: por %xmm0, %xmm1
4778 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
4779 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7]
4780 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4781 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
4782 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
4783 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2]
4784 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4785 ; SSE-NEXT: movdqa %xmm13, %xmm0
4786 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4787 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
4788 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
4789 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
4790 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
4791 ; SSE-NEXT: movdqa %xmm11, %xmm0
4792 ; SSE-NEXT: pandn (%rsp), %xmm0 # 16-byte Folded Reload
4793 ; SSE-NEXT: pand %xmm11, %xmm8
4794 ; SSE-NEXT: por %xmm0, %xmm8
4795 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,0,3]
4796 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7]
4797 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4798 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
4799 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
4800 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm0[0,2]
4801 ; SSE-NEXT: pand %xmm11, %xmm2
4802 ; SSE-NEXT: pandn %xmm12, %xmm11
4803 ; SSE-NEXT: por %xmm2, %xmm11
4804 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4805 ; SSE-NEXT: movdqa %xmm12, %xmm0
4806 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4807 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
4808 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
4809 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
4810 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
4811 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,1,0,3]
4812 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7]
4813 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
4814 ; SSE-NEXT: movdqa %xmm3, %xmm11
4815 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
4816 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[0,2]
4817 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
4818 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4819 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4820 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4821 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
4822 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4823 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
4824 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
4825 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4826 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4827 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
4828 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4829 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
4830 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4831 ; SSE-NEXT: # xmm0 = mem[0,1,0,3]
4832 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,2,2,2]
4833 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
4834 ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
4835 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1]
4836 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4837 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
4838 ; SSE-NEXT: psrldq {{.*#+}} xmm14 = xmm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4839 ; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
4840 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,1,1]
4841 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4842 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
4843 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7]
4844 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4845 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4846 ; SSE-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7]
4847 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4848 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
4849 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3]
4850 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
4851 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
4852 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
4853 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
4854 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
4855 ; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4856 ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
4857 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1]
4858 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4859 ; SSE-NEXT: # xmm2 = mem[2,1,2,3]
4860 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
4861 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
4862 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4863 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
4864 ; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload
4865 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
4866 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,1,0,3]
4867 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
4868 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
4869 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
4870 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1]
4871 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1]
4872 ; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
4873 ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
4874 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1]
4875 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4876 ; SSE-NEXT: # xmm2 = mem[2,1,2,3]
4877 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
4878 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4879 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4880 ; SSE-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7]
4881 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4882 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4883 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,1,0,3]
4884 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
4885 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
4886 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
4887 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
4888 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4889 ; SSE-NEXT: movaps %xmm1, (%rsi)
4890 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4891 ; SSE-NEXT: movaps %xmm1, 48(%rsi)
4892 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4893 ; SSE-NEXT: movaps %xmm1, 32(%rsi)
4894 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4895 ; SSE-NEXT: movaps %xmm1, 16(%rsi)
4896 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4897 ; SSE-NEXT: movaps %xmm1, (%rdx)
4898 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4899 ; SSE-NEXT: movaps %xmm1, 48(%rdx)
4900 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4901 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
4902 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4903 ; SSE-NEXT: movaps %xmm1, 16(%rdx)
4904 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4905 ; SSE-NEXT: movaps %xmm1, (%rcx)
4906 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4907 ; SSE-NEXT: movaps %xmm1, 48(%rcx)
4908 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4909 ; SSE-NEXT: movaps %xmm1, 32(%rcx)
4910 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4911 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
4912 ; SSE-NEXT: movdqa %xmm15, (%r8)
4913 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4914 ; SSE-NEXT: movaps %xmm1, 48(%r8)
4915 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4916 ; SSE-NEXT: movaps %xmm1, 32(%r8)
4917 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4918 ; SSE-NEXT: movaps %xmm1, 16(%r8)
4919 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4920 ; SSE-NEXT: movaps %xmm1, (%r9)
4921 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4922 ; SSE-NEXT: movaps %xmm1, 48(%r9)
4923 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4924 ; SSE-NEXT: movaps %xmm1, 32(%r9)
4925 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4926 ; SSE-NEXT: movaps %xmm1, 16(%r9)
4927 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
4928 ; SSE-NEXT: movaps %xmm6, (%rax)
4929 ; SSE-NEXT: movaps %xmm7, 48(%rax)
4930 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4931 ; SSE-NEXT: movaps %xmm1, 32(%rax)
4932 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4933 ; SSE-NEXT: movaps %xmm1, 16(%rax)
4934 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
4935 ; SSE-NEXT: movapd %xmm0, (%rax)
4936 ; SSE-NEXT: movapd %xmm3, 48(%rax)
4937 ; SSE-NEXT: movapd %xmm4, 32(%rax)
4938 ; SSE-NEXT: movapd %xmm5, 16(%rax)
4939 ; SSE-NEXT: addq $600, %rsp # imm = 0x258
4942 ; AVX-LABEL: load_i16_stride7_vf32:
4944 ; AVX-NEXT: subq $680, %rsp # imm = 0x2A8
4945 ; AVX-NEXT: vmovdqa 176(%rdi), %xmm15
4946 ; AVX-NEXT: vpsrld $16, %xmm15, %xmm0
4947 ; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4948 ; AVX-NEXT: vmovdqa 160(%rdi), %xmm8
4949 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[2,2,3,3]
4950 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4951 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4952 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm1
4953 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4954 ; AVX-NEXT: vpsrlq $16, %xmm1, %xmm1
4955 ; AVX-NEXT: vmovdqa 144(%rdi), %xmm2
4956 ; AVX-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
4957 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
4958 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
4959 ; AVX-NEXT: vmovdqa 208(%rdi), %xmm1
4960 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4961 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm2
4962 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4963 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
4964 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
4965 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
4966 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7]
4967 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm0
4968 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4969 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm2
4970 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4971 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
4972 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm2
4973 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4974 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
4975 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
4976 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm3
4977 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,3,2,3]
4978 ; AVX-NEXT: vmovdqa %xmm3, %xmm13
4979 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4980 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
4981 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
4982 ; AVX-NEXT: vmovdqa (%rdi), %xmm3
4983 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4984 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm0
4985 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4986 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
4987 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
4988 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
4989 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
4990 ; AVX-NEXT: vmovaps 32(%rdi), %xmm6
4991 ; AVX-NEXT: vmovaps 48(%rdi), %xmm7
4992 ; AVX-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm6[2],xmm7[2],zero
4993 ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4994 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4995 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7]
4996 ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
4997 ; AVX-NEXT: vandnps %ymm2, %ymm0, %ymm2
4998 ; AVX-NEXT: vandps %ymm0, %ymm3, %ymm3
4999 ; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2
5000 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
5001 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
5002 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5003 ; AVX-NEXT: vmovdqa 400(%rdi), %xmm1
5004 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5005 ; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
5006 ; AVX-NEXT: vmovdqa 384(%rdi), %xmm2
5007 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5008 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
5009 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5010 ; AVX-NEXT: vmovdqa 352(%rdi), %xmm9
5011 ; AVX-NEXT: vpsrlq $16, %xmm9, %xmm2
5012 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5013 ; AVX-NEXT: vmovdqa 368(%rdi), %xmm3
5014 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5015 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
5016 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
5017 ; AVX-NEXT: vmovdqa 432(%rdi), %xmm3
5018 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5019 ; AVX-NEXT: vmovdqa 416(%rdi), %xmm2
5020 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5021 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
5022 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
5023 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
5024 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
5025 ; AVX-NEXT: vmovdqa 224(%rdi), %xmm2
5026 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5027 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
5028 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
5029 ; AVX-NEXT: vmovdqa 240(%rdi), %xmm14
5030 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[2,2,3,3]
5031 ; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5032 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
5033 ; AVX-NEXT: vmovaps 256(%rdi), %xmm5
5034 ; AVX-NEXT: vmovaps 272(%rdi), %xmm11
5035 ; AVX-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm5[2],xmm11[2],zero
5036 ; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5037 ; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5038 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
5039 ; AVX-NEXT: vmovdqa 304(%rdi), %xmm4
5040 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5041 ; AVX-NEXT: vmovdqa 288(%rdi), %xmm3
5042 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5043 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
5044 ; AVX-NEXT: vmovdqa 320(%rdi), %xmm4
5045 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5046 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
5047 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
5048 ; AVX-NEXT: vmovdqa 336(%rdi), %xmm4
5049 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5050 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
5051 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
5052 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
5053 ; AVX-NEXT: vandps %ymm0, %ymm2, %ymm2
5054 ; AVX-NEXT: vandnps %ymm3, %ymm0, %ymm0
5055 ; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0
5056 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
5057 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
5058 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5059 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm15[4],xmm8[5],xmm15[5],xmm8[6],xmm15[6],xmm8[7],xmm15[7]
5060 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
5061 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5062 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5063 ; AVX-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload
5064 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,5],xmm10[6],xmm15[7]
5065 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
5066 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7]
5067 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
5068 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5069 ; AVX-NEXT: vpslld $16, %xmm1, %xmm1
5070 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5071 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
5072 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
5073 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5074 ; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
5075 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5076 ; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
5077 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5078 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5079 ; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
5080 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
5081 ; AVX-NEXT: vpsrld $16, %xmm13, %xmm2
5082 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
5083 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
5084 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7]
5085 ; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
5086 ; AVX-NEXT: vmovdqa %xmm3, %xmm4
5087 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5088 ; AVX-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload
5089 ; AVX-NEXT: # xmm3 = xmm13[0],mem[1],xmm13[2,3,4,5,6,7]
5090 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
5091 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7]
5092 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
5093 ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
5094 ; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1
5095 ; AVX-NEXT: vandps %ymm2, %ymm3, %ymm3
5096 ; AVX-NEXT: vorps %ymm1, %ymm3, %ymm1
5097 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
5098 ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
5099 ; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0
5100 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
5101 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
5102 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5103 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5104 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5105 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
5106 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
5107 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
5108 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
5109 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1,2,3,4,5],xmm7[6],xmm9[7]
5110 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
5111 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7]
5112 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
5113 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5114 ; AVX-NEXT: vpslld $16, %xmm6, %xmm1
5115 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5116 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
5117 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
5118 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
5119 ; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1
5120 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
5121 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm11[1],xmm14[2,3,4,5,6,7]
5122 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
5123 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
5124 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3,4],xmm4[5,6,7]
5125 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
5126 ; AVX-NEXT: vpsrld $16, %xmm14, %xmm4
5127 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
5128 ; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
5129 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5130 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5131 ; AVX-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
5132 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
5133 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5134 ; AVX-NEXT: vpsrld $16, %xmm2, %xmm5
5135 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
5136 ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
5137 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
5138 ; AVX-NEXT: vandnps %ymm4, %ymm2, %ymm4
5139 ; AVX-NEXT: vorps %ymm4, %ymm1, %ymm1
5140 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
5141 ; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
5142 ; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0
5143 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
5144 ; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
5145 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5146 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5147 ; AVX-NEXT: vpsllq $16, %xmm0, %xmm0
5148 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5149 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
5150 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,3,2,3]
5151 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
5152 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3],xmm15[4,5,6,7]
5153 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
5154 ; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
5155 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5156 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3]
5157 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7]
5158 ; AVX-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
5159 ; AVX-NEXT: # xmm0 = mem[2,2,2,2]
5160 ; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
5161 ; AVX-NEXT: # xmm0 = mem[0,1,2,3,4,5],xmm0[6,7]
5162 ; AVX-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5163 ; AVX-NEXT: # xmm4 = mem[0,1,0,1]
5164 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm4[7]
5165 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5166 ; AVX-NEXT: # xmm4 = mem[1,1,1,1]
5167 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
5168 ; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
5169 ; AVX-NEXT: # xmm0 = mem[0,1,0,3]
5170 ; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
5171 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5172 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,2,3,3]
5173 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm0[1],xmm5[1]
5174 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5175 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
5176 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3]
5177 ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7]
5178 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3,4,5,6,7]
5179 ; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
5180 ; AVX-NEXT: vandnps %ymm4, %ymm8, %ymm4
5181 ; AVX-NEXT: vandps %ymm5, %ymm8, %ymm5
5182 ; AVX-NEXT: vorps %ymm4, %ymm5, %ymm4
5183 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
5184 ; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
5185 ; AVX-NEXT: vandnps %ymm1, %ymm13, %ymm1
5186 ; AVX-NEXT: vandps %ymm4, %ymm13, %ymm4
5187 ; AVX-NEXT: vorps %ymm1, %ymm4, %ymm1
5188 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5189 ; AVX-NEXT: vpsllq $16, %xmm3, %xmm1
5190 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5191 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
5192 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[0,3,2,3]
5193 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
5194 ; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
5195 ; AVX-NEXT: # xmm4 = mem[0,1],xmm4[2,3],mem[4,5,6,7]
5196 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
5197 ; AVX-NEXT: vpsrldq {{.*#+}} xmm4 = xmm9[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
5198 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
5199 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5],xmm4[6,7]
5200 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5201 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,1,0,3]
5202 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
5203 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
5204 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,2,3,3]
5205 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm1[1],xmm5[1]
5206 ; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3]
; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[2,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[2,2,2,2]
; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,5],xmm15[6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3,4,5,6],xmm14[7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[1,1,1,1]
; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
; AVX-NEXT: vandps %ymm3, %ymm5, %ymm5
; AVX-NEXT: vandnps %ymm14, %ymm3, %ymm14
; AVX-NEXT: vorps %ymm5, %ymm14, %ymm5
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX-NEXT: vandnps %ymm4, %ymm13, %ymm4
; AVX-NEXT: vandps %ymm5, %ymm13, %ymm5
; AVX-NEXT: vmovaps %ymm13, %ymm6
; AVX-NEXT: vorps %ymm4, %ymm5, %ymm4
; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vpunpcklwd (%rsp), %xmm3, %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,3,3,4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = mem[0,1,2,3,4,5],xmm3[6],mem[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,0,0,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm4[1,2],xmm5[3,4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[2,1,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,5],xmm14[6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm10[1],xmm2[2,3,4,5,6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,0,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3,4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7]
; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload
; AVX-NEXT: # xmm15 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,4,5,6,7,8,9,4,5,8,9,2,3]
; AVX-NEXT: vpshufb %xmm5, %xmm15, %xmm15
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX-NEXT: vpsrlq $48, %xmm12, %xmm13
; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm13
; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
; AVX-NEXT: vandps %ymm0, %ymm15, %ymm0
; AVX-NEXT: vandnps %ymm13, %ymm15, %ymm13
; AVX-NEXT: vorps %ymm0, %ymm13, %ymm0
; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm13
; AVX-NEXT: vandnps %ymm13, %ymm6, %ymm13
; AVX-NEXT: vandps %ymm6, %ymm0, %ymm0
; AVX-NEXT: vorps %ymm0, %ymm13, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,3,4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm13 # 16-byte Folded Reload
; AVX-NEXT: # xmm13 = xmm6[0,1,2,3,4,5],mem[6],xmm6[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,0,0,0,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,6,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm0[1,2],xmm13[3,4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[2,1,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3,4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload
; AVX-NEXT: # xmm14 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7]
; AVX-NEXT: vpshufb %xmm5, %xmm14, %xmm5
; AVX-NEXT: vpsrlq $48, %xmm11, %xmm14
; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm5, %ymm5
; AVX-NEXT: vandps %ymm1, %ymm15, %ymm1
; AVX-NEXT: vandnps %ymm5, %ymm15, %ymm2
; AVX-NEXT: vorps %ymm2, %ymm1, %ymm1
; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm2
; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
; AVX-NEXT: vandnps %ymm2, %ymm5, %ymm2
; AVX-NEXT: vandps %ymm5, %ymm1, %ymm1
; AVX-NEXT: vorps %ymm2, %ymm1, %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = mem[1,1,1,1]
; AVX-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3],xmm1[4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[0,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,2,1]
; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm2[6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpsrlq $48, %xmm1, %xmm1
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[2,3,2,3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX-NEXT: vpsrlq $16, %xmm10, %xmm4
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3],xmm1[4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,14,15,4,5,6,7,0,1,4,5,8,9,6,7]
; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm5
; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,3,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
; AVX-NEXT: vandnps %ymm2, %ymm5, %ymm2
; AVX-NEXT: vandps %ymm5, %ymm4, %ymm4
; AVX-NEXT: vmovaps %ymm5, %ymm13
; AVX-NEXT: vorps %ymm2, %ymm4, %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm12[2,3],xmm2[4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX-NEXT: vpsrlq $48, %xmm10, %xmm2
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[2,3,2,3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vpsrlq $16, %xmm3, %xmm4
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6,7]
; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = mem[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm1
; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = mem[2,3,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: vandnps %ymm0, %ymm13, %ymm0
; AVX-NEXT: vandps %ymm1, %ymm13, %ymm1
; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpsrlq $48, %xmm11, %xmm0
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3]
; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7]
; AVX-NEXT: vpshufb %xmm15, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3,4,5],xmm11[6],xmm13[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
; AVX-NEXT: vpsrld $16, %xmm7, %xmm1
; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
; AVX-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload
; AVX-NEXT: # xmm2 = mem[0],xmm14[1],mem[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,2]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
; AVX-NEXT: vpshufb %xmm15, %xmm1, %xmm1
; AVX-NEXT: vpsrlq $48, %xmm12, %xmm2
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3,4,5],xmm0[6],xmm7[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
; AVX-NEXT: vpsrld $16, %xmm6, %xmm2
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
; AVX-NEXT: vmovdqa %xmm3, %xmm10
; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm15[1],xmm9[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,2]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
; AVX-NEXT: vandnps %ymm1, %ymm4, %ymm1
; AVX-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX-NEXT: # xmm2 = zero,xmm2[1],mem[0],zero
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX-NEXT: vpunpckhdq (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4],xmm4[5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm13[1],xmm11[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7]
; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = mem[1,1,1,1]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7]
; AVX-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3]
; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm12[6,7]
; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
; AVX-NEXT: # xmm12 = mem[3,3,3,3]
; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
; AVX-NEXT: vandnps %ymm2, %ymm5, %ymm2
; AVX-NEXT: vandps %ymm5, %ymm4, %ymm4
; AVX-NEXT: vmovaps %ymm5, %ymm11
; AVX-NEXT: vorps %ymm2, %ymm4, %ymm2
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = zero,xmm4[1],mem[0],zero
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0],xmm7[1],xmm0[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[1,1,1,1]
; AVX-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
; AVX-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm6 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3]
; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm7[6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[3,3,3,3]
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX-NEXT: vandnps %ymm4, %ymm11, %ymm4
; AVX-NEXT: vandps %ymm5, %ymm11, %ymm3
; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 32(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm4, (%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 32(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, (%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 32(%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, (%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 32(%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, (%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 32(%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, (%r9)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovaps %ymm1, 32(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, (%rax)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovaps %ymm3, 32(%rax)
; AVX-NEXT: vmovaps %ymm2, (%rax)
; AVX-NEXT: addq $680, %rsp # imm = 0x2A8
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: load_i16_stride7_vf32:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $520, %rsp # imm = 0x208
; AVX2-NEXT: vmovdqa 288(%rdi), %ymm10
; AVX2-NEXT: vmovdqa 320(%rdi), %ymm9
; AVX2-NEXT: vmovdqa 256(%rdi), %ymm5
; AVX2-NEXT: vmovdqa 224(%rdi), %ymm13
; AVX2-NEXT: vmovdqa (%rdi), %ymm3
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4
; AVX2-NEXT: vmovdqa 64(%rdi), %ymm11
; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7]
; AVX2-NEXT: vmovdqa %ymm1, %ymm8
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19]
; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
; AVX2-NEXT: vmovdqa %ymm4, %ymm6
; AVX2-NEXT: vmovdqa %ymm3, %ymm7
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm3
; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm12 = [65535,65535,65535,65535,65535,0,0,0]
; AVX2-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa %ymm5, %ymm3
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2],ymm13[3,4,5],ymm5[6],ymm13[7]
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7]
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
; AVX2-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7]
; AVX2-NEXT: vmovdqa %ymm13, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT: vpblendvb %ymm12, %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm11[2],ymm8[3,4],ymm11[5],ymm8[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7]
; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm11[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm11[2,3],ymm8[4,5],ymm11[6,7]
; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7]
; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23]
; AVX2-NEXT: vpshufb %ymm15, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm13
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7]
; AVX2-NEXT: vmovdqa %ymm1, %ymm4
; AVX2-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
; AVX2-NEXT: vmovdqa %ymm3, %ymm1
; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa %ymm9, %ymm3
; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm10[2,3,0,1]
; AVX2-NEXT: vmovdqa %ymm10, %ymm14
; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7,8,9,10,11],ymm5[12],ymm2[13,14,15]
; AVX2-NEXT: vpshufb %ymm15, %ymm2, %ymm2
; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm10
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7]
; AVX2-NEXT: vmovdqa %ymm7, %ymm15
; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm2
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm9
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm2
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm8
; AVX2-NEXT: vmovdqa 128(%rdi), %ymm7
; AVX2-NEXT: vmovdqa 160(%rdi), %ymm2
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm7[3],ymm2[4,5],ymm7[6],ymm2[7]
; AVX2-NEXT: vmovdqa %ymm2, %ymm11
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
; AVX2-NEXT: vmovdqa 192(%rdi), %ymm5
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,0,2]
; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,1,4,5,6,5]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 352(%rdi), %ymm6
; AVX2-NEXT: vmovdqa 384(%rdi), %ymm4
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,6,4,7]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7]
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
; AVX2-NEXT: vmovdqa 416(%rdi), %ymm3
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm3[0,1,0,2]
; AVX2-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,2,1,4,5,6,5]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm4[2],ymm6[3,4,5],ymm4[6],ymm6[7]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm14
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4],xmm1[5],xmm14[6],xmm1[7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
; AVX2-NEXT: vpshufb %xmm14, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm11[2],ymm7[3,4,5],ymm11[6],ymm7[7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; AVX2-NEXT: vpshufb %xmm14, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,2]
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,1,2]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2,3,4,5,6,7],ymm10[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,2,0,4,5,6,4]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5,6,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm3[0,1,1,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm7[1],ymm11[2,3,4],ymm7[5],ymm11[6,7]
; AVX2-NEXT: vmovdqa %ymm7, %ymm12
; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
; AVX2-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,1,4,5,6,5]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
; AVX2-NEXT: # ymm3 = mem[0,1],ymm15[2],mem[3,4],ymm15[5],mem[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload
; AVX2-NEXT: # ymm5 = mem[0,1],ymm14[2],mem[3,4,5],ymm14[6],mem[7]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
; AVX2-NEXT: vpshufb %ymm7, %ymm5, %ymm8
; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1,2,3,4,5,6,7],ymm8[8],ymm2[9,10,11,12,13,14,15]
; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6],ymm8[7,8,9,10,11,12,13],ymm5[14],ymm8[15]
; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7]
; AVX2-NEXT: vpshufb %ymm7, %ymm3, %ymm5
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6],ymm5[7,8,9,10,11,12,13],ymm3[14],ymm5[15]
; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm7
; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5,6,7]
; AVX2-NEXT: vpshufb %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,5]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa 432(%rdi), %xmm3
; AVX2-NEXT: vmovdqa 416(%rdi), %xmm0
; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6]
; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm13[2,3],ymm8[4,5],ymm13[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8
; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
; AVX2-NEXT: vpshufb %ymm8, %ymm1, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7]
; AVX2-NEXT: vmovdqa %ymm11, %ymm15
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm7
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3],xmm1[4],xmm7[5,6,7]
; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5
; AVX2-NEXT: vmovdqa 208(%rdi), %xmm11
; AVX2-NEXT: vmovdqa 192(%rdi), %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2,3,4,5],xmm11[6],xmm1[7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,7,6]
; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm12[5,6,7],ymm5[8,9,10,11,12],ymm12[13,14,15]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm14[3],ymm7[4,5],ymm14[6],ymm7[7]
; AVX2-NEXT: vmovdqa %ymm14, %ymm13
; AVX2-NEXT: vpermq {{.*#+}} ymm14 = ymm12[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3,4,5,6],ymm14[7,8],ymm12[9,10,11,12,13,14],ymm14[15]
; AVX2-NEXT: vpshufb %ymm8, %ymm12, %ymm8
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0,1],ymm2[2,3],ymm9[4,5],ymm2[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm14
; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3]
; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1,2,3,4,5,6,7],ymm8[8],ymm5[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
; AVX2-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload
; AVX2-NEXT: # ymm8 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm10[1],ymm8[2,3,4,5,6,7,8],ymm10[9],ymm8[10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5,6,7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15]
; AVX2-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload
; AVX2-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-NEXT: # ymm4 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7]
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
; AVX2-NEXT: vpshufb %ymm6, %ymm8, %ymm8
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload
; AVX2-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm8
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3],xmm8[4],xmm4[5],xmm8[6,7]
; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm1[1],xmm11[2,3,4,5,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7],ymm3[8,9,10,11,12],ymm1[13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5,6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6,7,8],ymm4[9],ymm3[10,11,12,13,14,15]
; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm3
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2],ymm2[3],ymm9[4,5],ymm2[6],ymm9[7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,6,4,6,7]
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm2, 32(%rsi)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm2, (%rsi)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm2, 32(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm2, (%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm2, 32(%rcx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm2, (%rcx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm2, 32(%r8)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm2, (%r8)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm2, 32(%r9)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm2, (%r9)
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm2, 32(%rax)
; AVX2-NEXT: vmovdqa %ymm5, (%rax)
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: vmovdqa %ymm0, 32(%rax)
; AVX2-NEXT: vmovdqa %ymm1, (%rax)
; AVX2-NEXT: addq $520, %rsp # imm = 0x208
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i16_stride7_vf32:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: subq $552, %rsp # imm = 0x228
; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm11
; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm10
; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm5
; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm12
; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3
; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm4
; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm6
; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7]
; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm9
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19]
; AVX2-FP-NEXT: vpshufb %ymm0, %ymm1, %ymm2
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm7
; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm8
; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, %xmm3
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0]
; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm3, %ymm2, %ymm1
; AVX2-FP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm1
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm5[2],ymm12[3,4,5],ymm5[6],ymm12[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm5
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6],xmm2[7]
; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
; AVX2-FP-NEXT: vpshufb %ymm0, %ymm4, %ymm0
; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21]
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm4, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15]
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7]
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm6[2,3,0,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm6[2,3],ymm9[4,5],ymm6[6,7]
; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm3
; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23]
; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm2, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm0
; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm13
; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm11[2,3],ymm10[4,5],ymm11[6,7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm11[2,3,0,1]
; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7,8,9,10,11],ymm5[12],ymm2[13,14,15]
; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm7
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm8[1],ymm3[2,3,4],ymm8[5],ymm3[6,7]
; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm15
; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15]
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm0
; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm10
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm0
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm0, %ymm2, %ymm8
; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm6
; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm2
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm6[3],ymm2[4,5],ymm6[6],ymm2[7]
; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm11
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7]
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm5
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm5[0,1,0,2]
; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm4
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
; AVX2-FP-NEXT: vpblendd $31, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm9
; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm4
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm9[3],ymm4[4,5],ymm9[6],ymm4[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm14
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm14, %xmm3
; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm1
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm1[0,1,0,2]
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm14, %ymm13
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm13[7]
; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4],ymm3[5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm4[2],ymm9[3,4,5],ymm4[6],ymm9[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm13
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm13[4],xmm3[5],xmm13[6],xmm3[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %xmm13, %xmm3, %xmm3
; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm14, %ymm14
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm3[1,2,3,4,5,6,7],ymm12[8],ymm3[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3],ymm3[4,5,6,7]
6148 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6149 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm11[2],ymm6[3,4,5],ymm11[6],ymm6[7]
6150 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm14
6151 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm14[4],xmm3[5],xmm14[6],xmm3[7]
6152 ; AVX2-FP-NEXT: vpshufb %xmm13, %xmm3, %xmm3
6153 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
6154 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2
6155 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
6156 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6157 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
6158 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6159 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6160 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7]
6161 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2
6162 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
6163 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
6164 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
6165 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6166 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,1,1,2]
6167 ; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31]
6168 ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm3, %ymm3
6169 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
6170 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6171 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
6172 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
6173 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6174 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm4[3],ymm9[4,5],ymm4[6],ymm9[7]
6175 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3
6176 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
6177 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
6178 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,1,2]
6179 ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm2, %ymm2
6180 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6181 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
6182 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15]
6183 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
6184 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6185 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5,6,7]
6186 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2
6187 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
6188 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
6189 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
6190 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6191 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,1,1,3]
6192 ; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
6193 ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm7, %ymm5
6194 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7]
6195 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1,2,3,4,5,6,7],ymm10[8],ymm0[9,10,11,12,13,14,15]
6196 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
6197 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6198 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm9[1],ymm4[2,3],ymm9[4],ymm4[5,6,7]
6199 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm5
6200 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3,4,5],xmm5[6],xmm0[7]
6201 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
6202 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
6203 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,1,1,3]
6204 ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm1
6205 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
6206 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15]
6207 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
6208 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6209 ; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
6210 ; AVX2-FP-NEXT: # ymm2 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7]
6211 ; AVX2-FP-NEXT: vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
6212 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm3
6213 ; AVX2-FP-NEXT: vmovdqa %xmm1, %xmm14
6214 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm2
6215 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
6216 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
6217 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6218 ; AVX2-FP-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
6219 ; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm1[2],mem[3,4,5],ymm1[6],mem[7]
6220 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
6221 ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm3, %ymm8
6222 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
6223 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6],ymm8[7,8,9,10,11,12,13],ymm3[14],ymm8[15]
6224 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
6225 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm6[1],ymm11[2,3,4],ymm6[5],ymm11[6,7]
6226 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6227 ; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6228 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm10
6229 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm3[1],xmm10[2],xmm3[3],xmm10[4,5,6,7]
6230 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
6231 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm3, %xmm3
6232 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
6233 ; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
6234 ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm7, %ymm7
6235 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7]
6236 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1,2,3,4,5,6,7],ymm8[8],ymm3[9,10,11,12,13,14,15]
6237 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm3[4,5,6,7]
6238 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6239 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
6240 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
6241 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7]
6242 ; AVX2-FP-NEXT: vpshufb %xmm14, %xmm2, %xmm3
6243 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm2
6244 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
6245 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
6246 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6247 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
6248 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm1[2],ymm8[3,4,5],ymm1[6],ymm8[7]
6249 ; AVX2-FP-NEXT: vpshufb %ymm5, %ymm3, %ymm5
6250 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
6251 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6],ymm5[7,8,9,10,11,12,13],ymm3[14],ymm5[15]
6252 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
6253 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7]
6254 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm7
6255 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5,6,7]
6256 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm3, %xmm3
6257 ; AVX2-FP-NEXT: vpshufb %ymm13, %ymm0, %ymm0
6258 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
6259 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
6260 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15]
6261 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6262 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6263 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm1[3],ymm8[4,5],ymm1[6],ymm8[7]
6264 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
6265 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1,2,3,4,5,6],ymm2[7,8],ymm0[9,10,11,12,13,14],ymm2[15]
6266 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7]
6267 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3
6268 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4],xmm3[5,6,7]
6269 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
6270 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm0
6271 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8
6272 ; AVX2-FP-NEXT: vmovdqa 432(%rdi), %xmm3
6273 ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %xmm0
6274 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7]
6275 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13]
6276 ; AVX2-FP-NEXT: vpshufb %xmm13, %xmm10, %xmm10
6277 ; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
6278 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15]
6279 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7]
6280 ; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm14
6281 ; AVX2-FP-NEXT: vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
6282 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm14, %xmm14
6283 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
6284 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
6285 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
6286 ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
6287 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm8[1,2,3,4,5,6,7],ymm2[8],ymm8[9,10,11,12,13,14,15]
6288 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3]
6289 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
6290 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6291 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm6[2],ymm11[3,4],ymm6[5],ymm11[6,7]
6292 ; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm10
6293 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3],xmm8[4],xmm10[5,6,7]
6294 ; AVX2-FP-NEXT: vpshufb %xmm7, %xmm8, %xmm10
6295 ; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm15
6296 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm12
6297 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,5],xmm15[6],xmm12[7]
6298 ; AVX2-FP-NEXT: vpshufb %xmm13, %xmm5, %xmm5
6299 ; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
6300 ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
6301 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7],ymm10[8,9,10,11,12],ymm5[13,14,15]
6302 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
6303 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
6304 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm7[3],ymm11[4,5],ymm7[6],ymm11[7]
6305 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
6306 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1,2,3,4,5,6],ymm13[7,8],ymm10[9,10,11,12,13,14],ymm13[15]
6307 ; AVX2-FP-NEXT: vpshufb %ymm14, %ymm10, %ymm10
6308 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
6309 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6310 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7]
6311 ; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm14
6312 ; AVX2-FP-NEXT: vpshufb %xmm1, %xmm14, %xmm14
6313 ; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
6314 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3]
6315 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm5[1,2,3,4,5,6,7],ymm10[8],ymm5[9,10,11,12,13,14,15]
6316 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3]
6317 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
6318 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6319 ; AVX2-FP-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
6320 ; AVX2-FP-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5,6,7]
6321 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm5[2,3,0,1]
6322 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4,5,6,7,8],ymm10[9],ymm5[10,11,12,13,14,15]
6323 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7]
6324 ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm6
6325 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7]
6326 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5,6,7]
6327 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
6328 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm4
6329 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
6330 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15]
6331 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
6332 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6333 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15]
6334 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6335 ; AVX2-FP-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
6336 ; AVX2-FP-NEXT: # ymm4 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7]
6337 ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm10
6338 ; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
6339 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm10, %xmm10
6340 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
6341 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
6342 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
6343 ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm5, %ymm5
6344 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15]
6345 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
6346 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
6347 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6348 ; AVX2-FP-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
6349 ; AVX2-FP-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7]
6350 ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5
6351 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7]
6352 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm3
6353 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7]
6354 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm4, %xmm4
6355 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
6356 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
6357 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15]
6358 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5,6,7]
6359 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
6360 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7,8],ymm5[9],ymm4[10,11,12,13,14,15]
6361 ; AVX2-FP-NEXT: vpshufb %ymm10, %ymm4, %ymm4
6362 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm8[3],ymm2[4,5],ymm8[6],ymm2[7]
6363 ; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6
6364 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm6, %xmm6
6365 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7]
6366 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
6367 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15]
6368 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
6369 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
6370 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6371 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rsi)
6372 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
6373 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rsi)
6374 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6375 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rdx)
6376 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6377 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rdx)
6378 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6379 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rcx)
6380 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6381 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rcx)
6382 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6383 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%r8)
6384 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6385 ; AVX2-FP-NEXT: vmovaps %ymm2, (%r8)
6386 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6387 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%r9)
6388 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6389 ; AVX2-FP-NEXT: vmovaps %ymm2, (%r9)
6390 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6391 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6392 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rax)
6393 ; AVX2-FP-NEXT: vmovdqa %ymm1, (%rax)
6394 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6395 ; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rax)
6396 ; AVX2-FP-NEXT: vmovdqa %ymm3, (%rax)
6397 ; AVX2-FP-NEXT: addq $552, %rsp # imm = 0x228
6398 ; AVX2-FP-NEXT: vzeroupper
6399 ; AVX2-FP-NEXT: retq
6400 ;
6401 ; AVX2-FCP-LABEL: load_i16_stride7_vf32:
6402 ; AVX2-FCP: # %bb.0:
6403 ; AVX2-FCP-NEXT: subq $648, %rsp # imm = 0x288
6404 ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm7
6405 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm8
6406 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm5
6407 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm6
6408 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3
6409 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm4
6410 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm1
6411 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm2
6412 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
6413 ; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm13
6414 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm14
6415 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
6416 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19]
6417 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm2
6418 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
6419 ; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm11
6420 ; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm12
6421 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
6422 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7]
6423 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
6424 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4
6425 ; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
6426 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm15
6427 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7]
6428 ; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm9
6429 ; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm10
6430 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
6431 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6],xmm2[7]
6432 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
6433 ; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm6
6434 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7]
6435 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
6436 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1
6437 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
6438 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6439 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7]
6440 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,5,1,0,4,0,0,0]
6441 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1
6442 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25]
6443 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
6444 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
6445 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
6446 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
6447 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
6448 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
6449 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm4, %ymm1, %ymm1
6450 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6451 ; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm7
6452 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7]
6453 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
6454 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7]
6455 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
6456 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7]
6457 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm2
6458 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
6459 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
6460 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6461 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7]
6462 ; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6463 ; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6464 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7]
6465 ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6466 ; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6467 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
6468 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
6469 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,6,1,0,5,0,0,0]
6470 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1
6471 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27]
6472 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
6473 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
6474 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
6475 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm9
6476 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7]
6477 ; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6478 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6479 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
6480 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7]
6481 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
6482 ; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6483 ; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm5
6484 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6485 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7]
6486 ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
6487 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
6488 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm6
6489 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7]
6490 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
6491 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
6492 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7]
6493 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
6494 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
6495 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
6496 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15]
6497 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1
6498 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
6499 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6500 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7]
6501 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
6502 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
6503 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1
6504 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7]
6505 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,3,2,3]
6506 ; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
6507 ; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm8
6508 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
6509 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
6510 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
6511 ; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm5
6512 ; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm7
6513 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [3,6,2,5,3,6,2,5]
6514 ; AVX2-FCP-NEXT: # ymm13 = mem[0,1,0,1]
6515 ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm13, %ymm2
6516 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31]
6517 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm12
6518 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm4
6519 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,0,2]
6520 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
6521 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm11
6522 ; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm10
6523 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7]
6524 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm11[5,6,7]
6525 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6526 ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm2
6527 ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm3
6528 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
6529 ; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm12
6530 ; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm11
6531 ; AVX2-FCP-NEXT: vpermd %ymm14, %ymm13, %ymm13
6532 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm1
6533 ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm2
6534 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm2[0,1,0,2]
6535 ; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm3
6536 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm15
6537 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7]
6538 ; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6539 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
6540 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6541 ; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm10
6542 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm5[2],ymm7[3,4,5],ymm5[6],ymm7[7]
6543 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm15
6544 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm15[4],xmm1[5],xmm15[6],xmm1[7]
6545 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
6546 ; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1
6547 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6548 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
6549 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
6550 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
6551 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6552 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
6553 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6554 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6555 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4,5],ymm12[6],ymm11[7]
6556 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
6557 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
6558 ; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm0
6559 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm14, %ymm1
6560 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6561 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
6562 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6563 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
6564 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6565 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6566 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7]
6567 ; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm7
6568 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
6569 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
6570 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
6571 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
6572 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6573 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5]
6574 ; AVX2-FCP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill
6575 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm14
6576 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
6577 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm14[7]
6578 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15]
6579 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
6580 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6581 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7]
6582 ; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm14
6583 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm9
6584 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0],xmm0[1],xmm9[2,3,4,5],xmm0[6],xmm9[7]
6585 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
6586 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6587 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6588 ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm1
6589 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
6590 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
6591 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15]
6592 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
6593 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6594 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6,7]
6595 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
6596 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
6597 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
6598 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
6599 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6600 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[0,1,1,3]
6601 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
6602 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm9
6603 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7]
6604 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6605 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15]
6606 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
6607 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6608 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7]
6609 ; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm12
6610 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
6611 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7]
6612 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
6613 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6614 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,1,1,3]
6615 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm1
6616 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
6617 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15]
6618 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
6619 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6620 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
6621 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
6622 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7]
6623 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
6624 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
6625 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
6626 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6627 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7]
6628 ; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6629 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6630 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm8
6631 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2],xmm2[3],xmm8[4,5,6,7]
6632 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
6633 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6634 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27]
6635 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm6[7]
6636 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6637 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6638 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm3[2],ymm0[3,4,5],ymm3[6],ymm0[7]
6639 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,7,2,6,0,0,0]
6640 ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm9, %ymm6
6641 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
6642 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm6
6643 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3]
6644 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1,2,3,4,5,6,7],ymm6[8],ymm2[9,10,11,12,13,14,15]
6645 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
6646 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6647 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6648 ; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6649 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
6650 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
6651 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
6652 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
6653 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
6654 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
6655 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
6656 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2],ymm13[3,4,5],ymm5[6],ymm13[7]
6657 ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm9, %ymm2
6658 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7]
6659 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm9
6660 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0],xmm6[1],xmm9[2],xmm6[3],xmm9[4,5,6,7]
6661 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
6662 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27]
6663 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
6664 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
6665 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2
6666 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
6667 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15]
6668 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
6669 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6670 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7]
6671 ; AVX2-FCP-NEXT: vpermd (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
6672 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
6673 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7]
6674 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [2,5,1,4,2,5,1,4]
6675 ; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1]
6676 ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6
6677 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
6678 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5,6,7],ymm6[8,9,10,11,12],ymm1[13,14,15]
6679 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
6680 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15
6681 ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm2 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
6682 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm15
6683 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
6684 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3]
6685 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7]
6686 ; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm9
6687 ; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm7
6688 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,4,7,3,6,0,0,0]
6689 ; AVX2-FCP-NEXT: vpermd %ymm15, %ymm2, %ymm15
6690 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
6691 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm15
6692 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6,7],ymm15[8],ymm1[9,10,11,12,13,14,15]
6693 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm15[2,3]
6694 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
6695 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6696 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
6697 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,3,3,0,3,7,7]
6698 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1
6699 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
6700 ; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm11
6701 ; AVX2-FCP-NEXT: vmovdqa %ymm14, %ymm12
6702 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7]
6703 ; AVX2-FCP-NEXT: vpermd %ymm10, %ymm8, %ymm8
6704 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
6705 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5,6,7],ymm8[8,9,10,11,12],ymm1[13,14,15]
6706 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7]
6707 ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm2, %ymm2
6708 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0
6709 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
6710 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
6711 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
6712 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm8
6713 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
6714 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
6715 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
6716 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
6717 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
6718 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6719 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6720 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6721 ; AVX2-FCP-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
6722 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7]
6723 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,7,0,0,4,7,0]
6724 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1]
6725 ; AVX2-FCP-NEXT: vpermd (%rsp), %ymm2, %ymm3 # 32-byte Folded Reload
6726 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
6727 ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
6728 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,6,1,5,2,6,1,5]
6729 ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1]
6730 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0
6731 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
6732 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
6733 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15]
6734 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7]
6735 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
6736 ; AVX2-FCP-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
6737 ; AVX2-FCP-NEXT: # ymm8 = mem[0,1,2],ymm7[3],mem[4,5],ymm7[6],mem[7]
6738 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
6739 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
6740 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9
6741 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7]
6742 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
6743 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,4,0,3,7,0,0,0]
6744 ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm9, %ymm3
6745 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
6746 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm3
6747 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
6748 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
6749 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
6750 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm2
6751 ; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
6752 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7]
6753 ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm5, %ymm3
6754 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3
6755 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15]
6756 ; AVX2-FCP-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload
6757 ; AVX2-FCP-NEXT: # ymm3 = mem[0],ymm13[1],mem[2,3],ymm13[4],mem[5,6,7]
6758 ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm9, %ymm3
6759 ; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm3
6760 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm14[3],ymm15[4,5],ymm14[6],ymm15[7]
6761 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
6762 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm5
6763 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
6764 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
6765 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15]
6766 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
6767 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
6768 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6769 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rsi)
6770 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6771 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi)
6772 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6773 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rdx)
6774 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6775 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx)
6776 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6777 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rcx)
6778 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6779 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rcx)
6780 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6781 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r8)
6782 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6783 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%r8)
6784 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6785 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r9)
6786 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6787 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%r9)
6788 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6789 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6790 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rax)
6791 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6792 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rax)
6793 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6794 ; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rax)
6795 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax)
6796 ; AVX2-FCP-NEXT: addq $648, %rsp # imm = 0x288
6797 ; AVX2-FCP-NEXT: vzeroupper
6798 ; AVX2-FCP-NEXT: retq
6799 ;
6800 ; AVX512-LABEL: load_i16_stride7_vf32:
6801 ; AVX512: # %bb.0:
6802 ; AVX512-NEXT: vmovdqa (%rdi), %ymm2
6803 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm3
6804 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm12
6805 ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm11
6806 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7]
6807 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
6808 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6809 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
6810 ; AVX512-NEXT: vmovdqa %ymm3, %ymm4
6811 ; AVX512-NEXT: vmovdqa %ymm2, %ymm8
6812 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
6813 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
6814 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u]
6815 ; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm21
6816 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm9
6817 ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm10
6818 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7]
6819 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7]
6820 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
6821 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
6822 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
6823 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
6824 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
6825 ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm0
6826 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
6827 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,1,4,5,6,5]
6828 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
6829 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
6830 ; AVX512-NEXT: vpbroadcastw 252(%rdi), %xmm3
6831 ; AVX512-NEXT: vmovdqa 224(%rdi), %xmm6
6832 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,1,0,3]
6833 ; AVX512-NEXT: vmovdqa %xmm6, %xmm13
6834 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
6835 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
6836 ; AVX512-NEXT: movw $992, %ax # imm = 0x3E0
6837 ; AVX512-NEXT: kmovw %eax, %k1
6838 ; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm21 {%k1}
6839 ; AVX512-NEXT: vmovdqa 256(%rdi), %ymm5
6840 ; AVX512-NEXT: vmovdqa 288(%rdi), %ymm6
6841 ; AVX512-NEXT: vmovdqa 240(%rdi), %xmm14
6842 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm2
6843 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7]
6844 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6,7,8,9,10],ymm2[11],ymm3[12,13,14,15]
6845 ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6846 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7]
6847 ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm7
6848 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2,3,4,5],xmm3[6],xmm7[7]
6849 ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6850 ; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
6851 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6852 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7]
6853 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
6854 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
6855 ; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
6856 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6857 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
6858 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
6859 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
6860 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0],xmm13[1],xmm14[2,3,4,5,6,7]
6861 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
6862 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
6863 ; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm22
6864 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7]
6865 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
6866 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15]
6867 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6868 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm8[1],ymm4[2,3],ymm8[4],ymm4[5,6,7]
6869 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
6870 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
6871 ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6872 ; AVX512-NEXT: vporq %ymm1, %ymm2, %ymm19
6873 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7]
6874 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
6875 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
6876 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
6877 ; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
6878 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm28
6879 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6880 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,2]
6881 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7]
6882 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
6883 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
6884 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
6885 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,1,2,3]
6886 ; AVX512-NEXT: vmovdqa64 %xmm3, %xmm27
6887 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
6888 ; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm23
6889 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7]
6890 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
6891 ; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
6892 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm8[1],ymm4[2,3,4],ymm8[5],ymm4[6,7]
6893 ; AVX512-NEXT: vmovdqa64 %ymm8, %ymm25
6894 ; AVX512-NEXT: vmovdqa64 %ymm4, %ymm26
6895 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
6896 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
6897 ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
6898 ; AVX512-NEXT: vporq %ymm1, %ymm2, %ymm20
6899 ; AVX512-NEXT: vpermq {{.*#+}} ymm24 = ymm0[0,1,1,3]
6900 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7]
6901 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
6902 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
6903 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
6904 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6905 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm24[0,1,2,1,4,5,6,5]
6906 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
6907 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
6908 ; AVX512-NEXT: vpbroadcastw 232(%rdi), %xmm1
6909 ; AVX512-NEXT: vpsrlq $48, %xmm14, %xmm2
6910 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
6911 ; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm18
6912 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7]
6913 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
6914 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
6915 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
6916 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6917 ; AVX512-NEXT: vmovdqa 208(%rdi), %xmm2
6918 ; AVX512-NEXT: vmovdqa 192(%rdi), %xmm1
6919 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5],xmm2[6],xmm1[7]
6920 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
6921 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6]
6922 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
6923 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15]
6924 ; AVX512-NEXT: vpsrld $16, %xmm13, %xmm3
6925 ; AVX512-NEXT: vmovdqa64 %xmm13, %xmm31
6926 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
6927 ; AVX512-NEXT: vmovdqa64 %xmm14, %xmm16
6928 ; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm17
6929 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
6930 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
6931 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm0[3],xmm3[4],xmm0[5],xmm3[6,7]
6932 ; AVX512-NEXT: vmovdqa 352(%rdi), %ymm15
6933 ; AVX512-NEXT: vmovdqa 320(%rdi), %ymm4
6934 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm15[3],ymm4[4,5],ymm15[6],ymm4[7]
6935 ; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1]
6936 ; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6],ymm8[7,8,9,10,11,12,13],ymm13[14],ymm8[15]
6937 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,1]
6938 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
6939 ; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19]
6940 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3,4,5,6],xmm8[7]
6941 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
6942 ; AVX512-NEXT: vmovdqa 384(%rdi), %ymm8
6943 ; AVX512-NEXT: vmovdqa 416(%rdi), %ymm13
6944 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7]
6945 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm7
6946 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7]
6947 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
6948 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
6949 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
6950 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6951 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
6952 ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm29
6953 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
6954 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
6955 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
6956 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7]
6957 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
6958 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4,5,6],ymm7[7,8],ymm3[9,10,11,12,13,14],ymm7[15]
6959 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
6960 ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
6961 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3,4,5,6],xmm3[7]
6962 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
6963 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm8[2,3],ymm13[4,5],ymm8[6,7]
6964 ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm7
6965 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
6966 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,2,4,5,6,7]
6967 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
6968 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
6969 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
6970 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
6971 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
6972 ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm30
6973 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm4[1],ymm15[2,3,4],ymm4[5],ymm15[6,7]
6974 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
6975 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,7,20,21,u,u,16,17,30,31,u,u,u,u,u,u,u,u]
6976 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5,6,7,8],ymm3[9],ymm0[10,11,12,13,14,15]
6977 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
6978 ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm7
6979 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4],xmm3[5],xmm7[6],xmm3[7]
6980 ; AVX512-NEXT: vmovdqa64 %ymm28, %ymm7
6981 ; AVX512-NEXT: vpshufb %ymm7, %ymm3, %ymm3
6982 ; AVX512-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
6983 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm8[3],ymm13[4,5],ymm8[6],ymm13[7]
6984 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm7
6985 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
6986 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
6987 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,7]
6988 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
6989 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6990 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
6991 ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm28
6992 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7]
6993 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
6994 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7]
6995 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
6996 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
6997 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6998 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[0,1,2,0,4,5,6,4]
6999 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
7000 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
7001 ; AVX512-NEXT: vmovdqa64 %xmm27, %xmm3
7002 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
7003 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
7004 ; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm27
7005 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
7006 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
7007 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
7008 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm13[2],ymm8[3,4,5],ymm13[6],ymm8[7]
7009 ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm14
7010 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm14[4],xmm3[5],xmm14[6],xmm3[7]
7011 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
7012 ; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm4[2],ymm15[3,4],ymm4[5],ymm15[6,7]
7013 ; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[1,1,2,0]
7014 ; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7015 ; AVX512-NEXT: vpor %ymm0, %ymm14, %ymm0
7016 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
7017 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
7018 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
7019 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0,1,2],ymm3[3,4,5,6,7],ymm14[8,9,10],ymm3[11,12,13,14,15]
7020 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
7021 ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm24
7022 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7]
7023 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm9
7024 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3],xmm9[4],xmm0[5],xmm9[6,7]
7025 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5,6,7]
7026 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,8,9,6,7,4,5,u,u,u,u,u,u]
7027 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7028 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
7029 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
7030 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
7031 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
7032 ; AVX512-NEXT: vmovdqa64 %xmm31, %xmm1
7033 ; AVX512-NEXT: vmovdqa64 %xmm16, %xmm2
7034 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
7035 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
7036 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
7037 ; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm16
7038 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7]
7039 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
7040 ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
7041 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7,8,9,10,11,12,13],ymm1[14],ymm0[15]
7042 ; AVX512-NEXT: vmovdqa64 %ymm25, %ymm3
7043 ; AVX512-NEXT: vmovdqa64 %ymm26, %ymm7
7044 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7]
7045 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
7046 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
7047 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
7048 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
7049 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
7050 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7]
7051 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm13[3],ymm8[4,5],ymm13[6],ymm8[7]
7052 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
7053 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
7054 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7]
7055 ; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm15[0,1,0,1]
7056 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm10[3],ymm2[4,5,6,7,8,9,10],ymm10[11],ymm2[12,13,14,15]
7057 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
7058 ; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm14
7059 ; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2,3,4,5],xmm14[6],xmm10[7]
7060 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11]
7061 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7062 ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7063 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
7064 ; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
7065 ; AVX512-NEXT: vpor %ymm2, %ymm10, %ymm2
7066 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm0[4,5,6,7]
7067 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5,6,7]
7068 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
7069 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
7070 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2],ymm4[3],ymm15[4,5],ymm4[6],ymm15[7]
7071 ; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm2[2,3,0,1]
7072 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4],ymm2[5,6,7,8,9,10,11],ymm14[12],ymm2[13,14,15]
7073 ; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
7074 ; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm9
7075 ; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm14[1],xmm9[2],xmm14[3],xmm9[4,5,6,7]
7076 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13]
7077 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7078 ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[12,13,26,27,24,25,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7079 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
7080 ; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
7081 ; AVX512-NEXT: vpor %ymm2, %ymm9, %ymm2
7082 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
7083 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
7084 ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
7085 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm2[1,2,3,4,5,6],ymm9[7,8],ymm2[9,10,11,12,13,14],ymm9[15]
7086 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7]
7087 ; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm14
7088 ; AVX512-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3]
7089 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,2,3,4,5,6,7]
7090 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3]
7091 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
7092 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3]
7093 ; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
7094 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3,4,5,6,7]
7095 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7]
7096 ; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1]
7097 ; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2,3,4,5,6,7,8],ymm11[9],ymm9[10,11,12,13,14,15]
7098 ; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm11
7099 ; AVX512-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm12
7100 ; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm14
7101 ; AVX512-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm25
7102 ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10
7103 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm26
7104 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7]
7105 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,6,4,6,7]
7106 ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3
7107 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
7108 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
7109 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
7110 ; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
7111 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm3[2,3,4,5,6,7]
7112 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm8[1],ymm13[2,3,4],ymm8[5],ymm13[6,7]
7113 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm8
7114 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2],xmm0[3],xmm8[4,5,6,7]
7115 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
7116 ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
7117 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7]
7118 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
7119 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,2,3,4,5,6,7]
7120 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
7121 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm15[2],ymm4[3,4,5],ymm15[6],ymm4[7]
7122 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,3,1]
7123 ; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
7124 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3,4,5,6,7]
7125 ; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
7126 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7127 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
7128 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
7129 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
7130 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
7131 ; AVX512-NEXT: vpternlogq $184, %zmm21, %zmm4, %zmm11
7132 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
7133 ; AVX512-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm22
7134 ; AVX512-NEXT: vpternlogq $184, %zmm22, %zmm4, %zmm12
7135 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
7136 ; AVX512-NEXT: vpternlogq $184, %zmm19, %zmm4, %zmm23
7137 ; AVX512-NEXT: movw $-512, %ax # imm = 0xFE00
7138 ; AVX512-NEXT: kmovw %eax, %k1
7139 ; AVX512-NEXT: vmovdqa32 %zmm14, %zmm23 {%k1}
7140 ; AVX512-NEXT: vpternlogq $184, %zmm20, %zmm4, %zmm27
7141 ; AVX512-NEXT: vmovdqa32 %zmm25, %zmm27 {%k1}
7142 ; AVX512-NEXT: vpternlogq $226, %zmm18, %zmm4, %zmm1
7143 ; AVX512-NEXT: vmovdqa32 %zmm10, %zmm1 {%k1}
7144 ; AVX512-NEXT: vmovdqa64 %zmm11, (%rsi)
7145 ; AVX512-NEXT: vmovdqa64 %zmm12, (%rdx)
7146 ; AVX512-NEXT: vmovdqa64 %zmm23, (%rcx)
7147 ; AVX512-NEXT: vmovdqa64 %zmm27, (%r8)
7148 ; AVX512-NEXT: vmovdqa64 %zmm1, (%r9)
7149 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
7150 ; AVX512-NEXT: vpternlogq $226, %zmm17, %zmm4, %zmm2
7151 ; AVX512-NEXT: vmovdqa32 %zmm26, %zmm2 {%k1}
7152 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rax)
7153 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
7154 ; AVX512-NEXT: vpternlogq $226, %zmm16, %zmm4, %zmm3
7155 ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
7156 ; AVX512-NEXT: vmovdqa64 %zmm3, (%rax)
7157 ; AVX512-NEXT: vzeroupper
7158 ; AVX512-NEXT: retq
7159 ;
7160 ; AVX512-FCP-LABEL: load_i16_stride7_vf32:
7161 ; AVX512-FCP: # %bb.0:
7162 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm29
7163 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31
7164 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,6,9,13,2,6,9,13]
7165 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm22
7166 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12]
7167 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [3,6,10,13,3,6,10,13]
7168 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm9
7169 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,6,9,0,13,0,0,0]
7170 ; AVX512-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm13
7171 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,0,0,0,4,8,11,15]
7172 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,5,9,0,12,0,0,0]
7173 ; AVX512-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm4
7174 ; AVX512-FCP-NEXT: vpermd %zmm9, %zmm16, %zmm0
7175 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,4,7,11,14]
7176 ; AVX512-FCP-NEXT: vpermd %zmm29, %zmm2, %zmm2
7177 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [8,1,12,5,12,5,14,15]
7178 ; AVX512-FCP-NEXT: vpermd %zmm31, %zmm5, %zmm5
7179 ; AVX512-FCP-NEXT: vpermd %zmm22, %zmm19, %zmm6
7180 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm28
7181 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm28[0,1,0,2]
7182 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
7183 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm15, %ymm7
7184 ; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm25
7185 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
7186 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3,4,5,6],ymm7[7]
7187 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7188 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm5
7189 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm6
7190 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
7191 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
7192 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
7193 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u]
7194 ; AVX512-FCP-NEXT: vporq %ymm8, %ymm11, %ymm23
7195 ; AVX512-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm8
7196 ; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %xmm12
7197 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15]
7198 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3]
7199 ; AVX512-FCP-NEXT: movw $992, %ax # imm = 0x3E0
7200 ; AVX512-FCP-NEXT: kmovw %eax, %k1
7201 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm23 {%k1}
7202 ; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm7
7203 ; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm8
7204 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
7205 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm14
7206 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3],xmm14[4],xmm11[5],xmm14[6,7]
7207 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u]
7208 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
7209 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3,4,5,6],xmm2[7]
7210 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm2[4,5,6,7]
7211 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
7212 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
7213 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
7214 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm26
7215 ; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm14
7216 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
7217 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11
7218 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
7219 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
7220 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
7221 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7222 ; AVX512-FCP-NEXT: vporq %ymm4, %ymm0, %ymm20
7223 ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm4
7224 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
7225 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm4[2],ymm11[3,4,5],ymm4[6],ymm11[7]
7226 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
7227 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
7228 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
7229 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7230 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
7231 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
7232 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0],xmm12[1],xmm14[2,3,4,5,6,7]
7233 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
7234 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm24
7235 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7236 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
7237 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
7238 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
7239 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
7240 ; AVX512-FCP-NEXT: vporq %ymm0, %ymm2, %ymm21
7241 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7]
7242 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
7243 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
7244 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
7245 ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0
7246 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7247 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5]
7248 ; AVX512-FCP-NEXT: vpermd %ymm28, %ymm2, %ymm2
7249 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
7250 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,6],ymm2[7]
7251 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3]
7252 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
7253 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm15
7254 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
7255 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
7256 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
7257 ; AVX512-FCP-NEXT: vpermd %zmm29, %zmm18, %zmm3
7258 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
7259 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
7260 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6],xmm3[7]
7261 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
7262 ; AVX512-FCP-NEXT: vpermd %zmm9, %zmm10, %zmm3
7263 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29]
7264 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
7265 ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm27
7266 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7]
7267 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
7268 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
7269 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [10,3,6,15,12,13,6,15]
7270 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,0,0,0,5,8,12,15]
7271 ; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2
7272 ; AVX512-FCP-NEXT: vpermd %zmm29, %zmm18, %zmm13
7273 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u]
7274 ; AVX512-FCP-NEXT: vpor %ymm2, %ymm13, %ymm2
7275 ; AVX512-FCP-NEXT: vpermd %zmm9, %zmm19, %zmm9
7276 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm9
7277 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm9[6,7]
7278 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
7279 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm9
7280 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0],xmm2[1],xmm9[2],xmm2[3],xmm9[4,5,6,7]
7281 ; AVX512-FCP-NEXT: vpermd %zmm31, %zmm3, %zmm3
7282 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7283 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
7284 ; AVX512-FCP-NEXT: vporq %ymm3, %ymm2, %ymm18
7285 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5,6,7]
7286 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
7287 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
7288 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
7289 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3
7290 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm28[0,1,1,3]
7291 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
7292 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm9[7]
7293 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
7294 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm19
7295 ; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm13
7296 ; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm3
7297 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm13[2],ymm3[3,4,5],ymm13[6],ymm3[7]
7298 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm9
7299 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6],xmm0[7]
7300 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7]
7301 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
7302 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7]
7303 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,11,2,11,12,5,8,9]
7304 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9]
7305 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7306 ; AVX512-FCP-NEXT: vpermd %zmm29, %zmm10, %zmm10
7307 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7308 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15]
7309 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
7310 ; AVX512-FCP-NEXT: vpor %ymm10, %ymm9, %ymm9
7311 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
7312 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm17
7313 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7]
7314 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
7315 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7]
7316 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
7317 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7318 ; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm4
7319 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
7320 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
7321 ; AVX512-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm2
7322 ; AVX512-FCP-NEXT: vpsrlq $48, %xmm14, %xmm4
7323 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
7324 ; AVX512-FCP-NEXT: vpermd %zmm22, %zmm16, %zmm4
7325 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,3,7,10,14,0,0,0]
7326 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm16
7327 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,3,0,0,3,7,0]
7328 ; AVX512-FCP-NEXT: vpermd %ymm28, %ymm0, %ymm0
7329 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
7330 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
7331 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2
7332 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
7333 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15]
7334 ; AVX512-FCP-NEXT: vpsrld $16, %xmm12, %xmm2
7335 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
7336 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm25
7337 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
7338 ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm0
7339 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm2
7340 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
7341 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
7342 ; AVX512-FCP-NEXT: vpermd %zmm31, %zmm10, %zmm4
7343 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
7344 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm4
7345 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4,5,6,7]
7346 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
7347 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm0, %zmm30, %zmm16
7348 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7]
7349 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
7350 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3,4,5],xmm0[6],xmm4[7]
7351 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7]
7352 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm10
7353 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm10[1],xmm4[2,3,4,5],xmm10[6],xmm4[7]
7354 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11]
7355 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7356 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,0,0,0,6,9,13,0]
7357 ; AVX512-FCP-NEXT: vpermd %zmm29, %zmm10, %zmm10
7358 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7359 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15]
7360 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
7361 ; AVX512-FCP-NEXT: vpor %ymm4, %ymm10, %ymm4
7362 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
7363 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
7364 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm10
7365 ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm10
7366 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
7367 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3]
7368 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,4,7,11,14,0,0,0]
7369 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm26
7370 ; AVX512-FCP-NEXT: vpermd %zmm31, %zmm10, %zmm10
7371 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
7372 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm10[2,3,4,5,6,7]
7373 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm3[1],ymm13[2,3],ymm3[4],ymm13[5,6,7]
7374 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm10
7375 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm10[1],xmm4[2,3,4,5],xmm10[6],xmm4[7]
7376 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm27
7377 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,4,7,0,0,4,7,0]
7378 ; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1]
7379 ; AVX512-FCP-NEXT: vpermd %ymm28, %ymm10, %ymm10
7380 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28
7381 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
7382 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm10
7383 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [2,6,9,13,2,6,9,13]
7384 ; AVX512-FCP-NEXT: vpermd %zmm22, %zmm9, %zmm9
7385 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm1
7386 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm10[5,6,7],ymm1[8,9,10,11,12],ymm10[13,14,15]
7387 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
7388 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
7389 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm10
7390 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7]
7391 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm9
7392 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2],xmm1[3],xmm9[4,5,6,7]
7393 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,0,0,0,6,10,13,0]
7394 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm12
7395 ; AVX512-FCP-NEXT: vpermd %zmm29, %zmm9, %zmm9
7396 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13]
7397 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
7398 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7399 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7],ymm9[8,9,10],ymm4[11,12,13,14,15]
7400 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
7401 ; AVX512-FCP-NEXT: vpor %ymm1, %ymm9, %ymm1
7402 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
7403 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,4,8,11,15,0,0,0]
7404 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm9
7405 ; AVX512-FCP-NEXT: vpermd %zmm31, %zmm4, %zmm0
7406 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
7407 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
7408 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
7409 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
7410 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
7411 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
7412 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
7413 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm3[1],ymm13[2,3,4],ymm3[5],ymm13[6,7]
7414 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
7415 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
7416 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
7417 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
7418 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7]
7419 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u]
7420 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
7421 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,10,3,14,7,10,3]
7422 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
7423 ; AVX512-FCP-NEXT: vpermd %zmm29, %zmm4, %zmm4
7424 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
7425 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3,4,5,6,7]
7426 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
7427 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
7428 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
7429 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
7430 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
7431 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
7432 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm23, %zmm3, %zmm26
7433 ; AVX512-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm20, %zmm24
7434 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm24, %zmm3, %zmm27
7435 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm21, %zmm30, %zmm15
7436 ; AVX512-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
7437 ; AVX512-FCP-NEXT: kmovw %eax, %k1
7438 ; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm15 {%k1}
7439 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm18, %zmm30, %zmm19
7440 ; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm19 {%k1}
7441 ; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm16 {%k1}
7442 ; AVX512-FCP-NEXT: vpternlogq $226, %zmm25, %zmm30, %zmm11
7443 ; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm11 {%k1}
7444 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, (%rsi)
7445 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, (%rdx)
7446 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, (%rcx)
7447 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%r8)
7448 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%r9)
7449 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7450 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax)
7451 ; AVX512-FCP-NEXT: vpternlogq $226, %zmm10, %zmm30, %zmm0
7452 ; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1}
7453 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7454 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
7455 ; AVX512-FCP-NEXT: vzeroupper
7456 ; AVX512-FCP-NEXT: retq
7457 ;
7458 ; AVX512DQ-LABEL: load_i16_stride7_vf32:
7459 ; AVX512DQ: # %bb.0:
7460 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2
7461 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm3
7462 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm1
7463 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm10
7464 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm1[1],ymm10[2,3,4],ymm1[5],ymm10[6,7]
7465 ; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm4
7466 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
7467 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7468 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7]
7469 ; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm12
7470 ; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm11
7471 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
7472 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
7473 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u]
7474 ; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm18
7475 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm8
7476 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm9
7477 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7]
7478 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7]
7479 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
7480 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
7481 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
7482 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
7483 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
7484 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm0
7485 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
7486 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,1,4,5,6,5]
7487 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
7488 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
7489 ; AVX512DQ-NEXT: vpbroadcastw 252(%rdi), %xmm3
7490 ; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm6
7491 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,1,0,3]
7492 ; AVX512DQ-NEXT: vmovdqa %xmm6, %xmm13
7493 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
7494 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
7495 ; AVX512DQ-NEXT: movw $992, %ax # imm = 0x3E0
7496 ; AVX512DQ-NEXT: kmovw %eax, %k1
7497 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm18 {%k1}
7498 ; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm6
7499 ; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm5
7500 ; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm14
7501 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm2
7502 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm4[2],ymm10[3,4],ymm4[5],ymm10[6,7]
7503 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6,7,8,9,10],ymm2[11],ymm3[12,13,14,15]
7504 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7505 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7]
7506 ; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm7
7507 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2,3,4,5],xmm3[6],xmm7[7]
7508 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
7509 ; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
7510 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7511 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm9[2],ymm8[3,4,5],ymm9[6],ymm8[7]
7512 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
7513 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
7514 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
7515 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
7516 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
7517 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
7518 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
7519 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0],xmm13[1],xmm14[2,3,4,5,6,7]
7520 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
7521 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
7522 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm21
7523 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7]
7524 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
7525 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15]
7526 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7527 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7]
7528 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
7529 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
7530 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
7531 ; AVX512DQ-NEXT: vporq %ymm1, %ymm2, %ymm19
7532 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7]
7533 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
7534 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
7535 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
7536 ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm1, %xmm1
7537 ; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm25
7538 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
7539 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,2]
7540 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7]
7541 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
7542 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
7543 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
7544 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,1,2,3]
7545 ; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm26
7546 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
7547 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm22
7548 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7]
7549 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
7550 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7551 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7]
7552 ; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm15
7553 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
7554 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
7555 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
7556 ; AVX512DQ-NEXT: vporq %ymm1, %ymm2, %ymm17
7557 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm23 = ymm0[0,1,1,3]
7558 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7]
7559 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
7560 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
7561 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
7562 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7563 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,1,2,1,4,5,6,5]
7564 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
7565 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
7566 ; AVX512DQ-NEXT: vpbroadcastw 232(%rdi), %xmm1
7567 ; AVX512DQ-NEXT: vpsrlq $48, %xmm14, %xmm2
7568 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
7569 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm20
7570 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7]
7571 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
7572 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
7573 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
7574 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7575 ; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm2
7576 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm7
7577 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,5],xmm2[6],xmm7[7]
7578 ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm29
7579 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
7580 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
7581 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
7582 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
7583 ; AVX512DQ-NEXT: vpsrld $16, %xmm13, %xmm1
7584 ; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm27
7585 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
7586 ; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm28
7587 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm24
7588 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm12
7589 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7]
7590 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
7591 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7]
7592 ; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm1
7593 ; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm14
7594 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7]
7595 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
7596 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7,8,9,10,11,12,13],ymm3[14],ymm2[15]
7597 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
7598 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
7599 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19]
7600 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
7601 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
7602 ; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm5
7603 ; AVX512DQ-NEXT: vmovdqa 416(%rdi), %ymm13
7604 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2],ymm13[3,4],ymm5[5],ymm13[6,7]
7605 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
7606 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
7607 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
7608 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
7609 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
7610 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
7611 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
7612 ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm30
7613 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm6[3],ymm12[4,5],ymm6[6],ymm12[7]
7614 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2
7615 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
7616 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm14[1],ymm1[2,3],ymm14[4],ymm1[5,6,7]
7617 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
7618 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6],ymm3[7,8],ymm2[9,10,11,12,13,14],ymm3[15]
7619 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
7620 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
7621 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
7622 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
7623 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2,3],ymm13[4,5],ymm5[6,7]
7624 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
7625 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
7626 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
7627 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
7628 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
7629 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
7630 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
7631 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
7632 ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm31
7633 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7]
7634 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
7635 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,7,20,21,u,u,16,17,30,31,u,u,u,u,u,u,u,u]
7636 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15]
7637 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm12[2],ymm6[3,4,5],ymm12[6],ymm6[7]
7638 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
7639 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
7640 ; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm3
7641 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2
7642 ; AVX512DQ-NEXT: vpternlogq $244, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
7643 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7]
7644 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3
7645 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
7646 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
7647 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7]
7648 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
7649 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7650 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
7651 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
7652 ; AVX512DQ-NEXT: vpternlogq $184, %zmm19, %zmm25, %zmm22
7653 ; AVX512DQ-NEXT: movw $-512, %ax # imm = 0xFE00
7654 ; AVX512DQ-NEXT: kmovw %eax, %k1
7655 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm22 {%k1}
7656 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7]
7657 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2
7658 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
7659 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
7660 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
7661 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7662 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,1,2,0,4,5,6,4]
7663 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
7664 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
7665 ; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm2
7666 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
7667 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
7668 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm19
7669 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7]
7670 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2
7671 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
7672 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm13[2],ymm5[3,4,5],ymm13[6],ymm5[7]
7673 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
7674 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
7675 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
7676 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm14[2],ymm1[3,4],ymm14[5],ymm1[6,7]
7677 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,0]
7678 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7679 ; AVX512DQ-NEXT: vpor %ymm3, %ymm0, %ymm0
7680 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
7681 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
7682 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
7683 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
7684 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
7685 ; AVX512DQ-NEXT: vpternlogq $184, %zmm17, %zmm25, %zmm19
7686 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm19 {%k1}
7687 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm10[2],ymm4[3,4,5],ymm10[6],ymm4[7]
7688 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
7689 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
7690 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15]
7691 ; AVX512DQ-NEXT: vmovdqa64 %ymm15, %ymm23
7692 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7]
7693 ; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm26
7694 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
7695 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
7696 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
7697 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
7698 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
7699 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm0[2,3,4,5,6,7]
7700 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm13[3],ymm5[4,5],ymm13[6],ymm5[7]
7701 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
7702 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7]
7703 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm14[2,3],ymm1[4,5],ymm14[6,7]
7704 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,0,1]
7705 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3],ymm3[4,5,6,7,8,9,10],ymm15[11],ymm3[12,13,14,15]
7706 ; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm0
7707 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6,7]
7708 ; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm12
7709 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1],xmm15[2,3,4,5],xmm12[6],xmm15[7]
7710 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11]
7711 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
7712 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7713 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
7714 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
7715 ; AVX512DQ-NEXT: vpor %ymm3, %ymm12, %ymm3
7716 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
7717 ; AVX512DQ-NEXT: vpternlogq $226, %zmm20, %zmm25, %zmm11
7718 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm11 {%k1}
7719 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5,6,7]
7720 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
7721 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
7722 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm14[3],ymm1[4,5],ymm14[6],ymm1[7]
7723 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,3,0,1]
7724 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7,8,9,10,11],ymm12[12],ymm3[13,14,15]
7725 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7]
7726 ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm20
7727 ; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm15
7728 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1],xmm15[2],xmm12[3],xmm15[4,5,6,7]
7729 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13]
7730 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
7731 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[12,13,26,27,24,25,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7732 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
7733 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
7734 ; AVX512DQ-NEXT: vpor %ymm3, %ymm12, %ymm3
7735 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
7736 ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm17
7737 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7]
7738 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm3
7739 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1]
7740 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm2[1,2,3,4,5,6],ymm12[7,8],ymm2[9,10,11,12,13,14],ymm12[15]
7741 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
7742 ; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9
7743 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7]
7744 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm8
7745 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm9
7746 ; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm0
7747 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm7[1],xmm0[2,3,4,5,6,7]
7748 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[8,9,10,11,8,9,6,7,4,5,u,u,u,u,u,u]
7749 ; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
7750 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
7751 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
7752 ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
7753 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0,1,2,3,4],ymm7[5,6,7],ymm12[8,9,10,11,12],ymm7[13,14,15]
7754 ; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm0
7755 ; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm4
7756 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
7757 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
7758 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,3,2,3,4,5,6,7]
7759 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm12, %zmm7, %zmm27
7760 ; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm7
7761 ; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm0
7762 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7]
7763 ; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm15
7764 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3]
7765 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,3,4,5,6,7]
7766 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
7767 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
7768 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
7769 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
7770 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm2[2,3,4,5,6,7]
7771 ; AVX512DQ-NEXT: vpternlogq $226, %zmm24, %zmm25, %zmm2
7772 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm17, %zmm0, %zmm2 {%k1}
7773 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm3[1],ymm10[2,3],ymm3[4],ymm10[5,6,7]
7774 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
7775 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6,7,8],ymm4[9],ymm3[10,11,12,13,14,15]
7776 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7]
7777 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,6,4,6,7]
7778 ; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm4
7779 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
7780 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
7781 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
7782 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
7783 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7]
7784 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7]
7785 ; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm10
7786 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0],xmm4[1],xmm10[2],xmm4[3],xmm10[4,5,6,7]
7787 ; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm0
7788 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7]
7789 ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6
7790 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7]
7791 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
7792 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,2,3,4,5,6,7]
7793 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
7794 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2],ymm14[3,4,5],ymm1[6],ymm14[7]
7795 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,3,1]
7796 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
7797 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4,5,6,7]
7798 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
7799 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
7800 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15]
7801 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
7802 ; AVX512DQ-NEXT: vpternlogq $226, %zmm27, %zmm25, %zmm3
7803 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1}
7804 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
7805 ; AVX512DQ-NEXT: vpternlogq $184, %zmm18, %zmm1, %zmm8
7806 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
7807 ; AVX512DQ-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm21
7808 ; AVX512DQ-NEXT: vpternlogq $184, %zmm21, %zmm1, %zmm9
7809 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rsi)
7810 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rdx)
7811 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, (%rcx)
7812 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%r8)
7813 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%r9)
7814 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
7815 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax)
7816 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
7817 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax)
7818 ; AVX512DQ-NEXT: vzeroupper
7819 ; AVX512DQ-NEXT: retq
7821 ; AVX512DQ-FCP-LABEL: load_i16_stride7_vf32:
7822 ; AVX512DQ-FCP: # %bb.0:
7823 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26
7824 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm30
7825 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,6,9,13,2,6,9,13]
7826 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm22
7827 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,5,9,12,2,5,9,12]
7828 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [3,6,10,13,3,6,10,13]
7829 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3
7830 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,6,9,0,13,0,0,0]
7831 ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm0, %zmm4
7832 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [1,0,0,0,4,8,11,15]
7833 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,5,9,0,12,0,0,0]
7834 ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm0, %zmm10
7835 ; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm18, %zmm0
7836 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,4,7,11,14]
7837 ; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm2, %zmm2
7838 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [8,1,12,5,12,5,14,15]
7839 ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm5, %zmm5
7840 ; AVX512DQ-FCP-NEXT: vpermd %zmm22, %zmm19, %zmm6
7841 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25
7842 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm25[0,1,0,2]
7843 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
7844 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm7
7845 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm29
7846 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
7847 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3,4,5,6],ymm7[7]
7848 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7849 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm5
7850 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm6
7851 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
7852 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm13
7853 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5],xmm13[6],xmm11[7]
7854 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u]
7855 ; AVX512DQ-FCP-NEXT: vporq %ymm8, %ymm11, %ymm23
7856 ; AVX512DQ-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm8
7857 ; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm13
7858 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15]
7859 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3]
7860 ; AVX512DQ-FCP-NEXT: movw $992, %ax # imm = 0x3E0
7861 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
7862 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm23 {%k1}
7863 ; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm7
7864 ; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm8
7865 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
7866 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm14
7867 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3],xmm14[4],xmm11[5],xmm14[6,7]
7868 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u]
7869 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
7870 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3,4,5,6],xmm2[7]
7871 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm2[4,5,6,7]
7872 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
7873 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
7874 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
7875 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm27
7876 ; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm15
7877 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
7878 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11
7879 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
7880 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
7881 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm0
7882 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7883 ; AVX512DQ-FCP-NEXT: vporq %ymm10, %ymm0, %ymm21
7884 ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm11
7885 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm10
7886 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7]
7887 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm9
7888 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6],xmm0[7]
7889 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
7890 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7891 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
7892 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7]
7893 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0],xmm13[1],xmm15[2,3,4,5,6,7]
7894 ; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm9, %xmm9
7895 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm24
7896 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7897 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
7898 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm9
7899 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3,4,5],xmm9[6],xmm4[7]
7900 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
7901 ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm4, %ymm9
7902 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7]
7903 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
7904 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm0[1],xmm4[2,3,4,5],xmm0[6],xmm4[7]
7905 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
7906 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4
7907 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
7908 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [2,5,2,5,2,5,2,5]
7909 ; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm12, %ymm12
7910 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
7911 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6],ymm12[7]
7912 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
7913 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
7914 ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm14
7915 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm31
7916 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm16
7917 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
7918 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14
7919 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3],xmm12[4],xmm14[5],xmm12[6],xmm14[7]
7920 ; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm20, %zmm14
7921 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
7922 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
7923 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3,4,5,6],xmm14[7]
7924 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7]
7925 ; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm17, %zmm14
7926 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29]
7927 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7]
7928 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm28
7929 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7]
7930 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm12
7931 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3],xmm12[4],xmm14[5],xmm12[6],xmm14[7]
7932 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [10,3,6,15,12,13,6,15]
7933 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [1,0,0,0,5,8,12,15]
7934 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm0
7935 ; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm20, %zmm12
7936 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u]
7937 ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm12, %ymm0
7938 ; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm19, %zmm3
7939 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
7940 ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm14, %zmm3
7941 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
7942 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm9, %zmm19, %zmm16
7943 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
7944 ; AVX512DQ-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
7945 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
7946 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm16 {%k1}
7947 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7948 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
7949 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
7950 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
7951 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
7952 ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm2, %ymm2
7953 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7]
7954 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
7955 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7]
7956 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
7957 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
7958 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm25[0,1,1,3]
7959 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
7960 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm9[7]
7961 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
7962 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm20
7963 ; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm3
7964 ; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm4
7965 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1],ymm3[2],ymm4[3,4,5],ymm3[6],ymm4[7]
7966 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12
7967 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm12[4],xmm9[5],xmm12[6],xmm9[7]
7968 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7]
7969 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14
7970 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3,4,5],xmm12[6],xmm14[7]
7971 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [2,11,2,11,12,5,8,9]
7972 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9]
7973 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
7974 ; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm14, %zmm14
7975 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
7976 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0,1,2],ymm9[3,4,5,6,7],ymm14[8,9,10],ymm9[11,12,13,14,15]
7977 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
7978 ; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm12, %ymm12
7979 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
7980 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm2, %zmm19, %zmm20
7981 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm9, %zmm0, %zmm20 {%k1}
7982 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
7983 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
7984 ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm12
7985 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9
7986 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7]
7987 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
7988 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,7,10,14,0,0,0]
7989 ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm9, %zmm14
7990 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
7991 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm14
7992 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3,4,5,6,7]
7993 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7]
7994 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11
7995 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
7996 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm9
7997 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
7998 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
7999 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
8000 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6],ymm0[7]
8001 ; AVX512DQ-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm10
8002 ; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm15, %xmm11
8003 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
8004 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm10
8005 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm12, %zmm19, %zmm10
8006 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
8007 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11
8008 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
8009 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7]
8010 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
8011 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7]
8012 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,0,0,0,6,9,13,0]
8013 ; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm12, %zmm12
8014 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11]
8015 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
8016 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8017 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15]
8018 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
8019 ; AVX512DQ-FCP-NEXT: vpor %ymm12, %ymm11, %ymm11
8020 ; AVX512DQ-FCP-NEXT: vpermd %zmm22, %zmm18, %zmm12
8021 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
8022 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1}
8023 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,3,0,0,3,7,0]
8024 ; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm0, %ymm0
8025 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
8026 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0
8027 ; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm11
8028 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7],ymm11[8,9,10,11,12],ymm0[13,14,15]
8029 ; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm13, %xmm11
8030 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7]
8031 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm12
8032 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
8033 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11
8034 ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm2
8035 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,4,7,11,14,0,0,0]
8036 ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm11, %zmm11
8037 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
8038 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
8039 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
8040 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1],ymm2[2,3,4,5,6,7]
8041 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7]
8042 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
8043 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
8044 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,7,0,0,4,7,0]
8045 ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1]
8046 ; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm2, %ymm2
8047 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
8048 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
8049 ; AVX512DQ-FCP-NEXT: vpermd %zmm22, %zmm17, %zmm9
8050 ; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm9
8051 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2,3,4],ymm2[5,6,7],ymm9[8,9,10,11,12],ymm2[13,14,15]
8052 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
8053 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm13
8054 ; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm9
8055 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2
8056 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7]
8057 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm13
8058 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2],xmm9[3],xmm13[4,5,6,7]
8059 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [3,0,0,0,6,10,13,0]
8060 ; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm13, %zmm13
8061 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13]
8062 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
8063 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
8064 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7],ymm13[8,9,10],ymm0[11,12,13,14,15]
8065 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
8066 ; AVX512DQ-FCP-NEXT: vpor %ymm13, %ymm9, %ymm9
8067 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
8068 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm12, %zmm19, %zmm11
8069 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1}
8070 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,4,8,11,15,0,0,0]
8071 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm9
8072 ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm0, %zmm0
8073 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
8074 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
8075 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5
8076 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
8077 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
8078 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
8079 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
8080 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7]
8081 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
8082 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7]
8083 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
8084 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
8085 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7]
8086 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u]
8087 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
8088 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,10,3,14,7,10,3]
8089 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm5
8090 ; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm4, %zmm4
8091 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
8092 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3,4,5,6,7]
8093 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
8094 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
8095 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15]
8096 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
8097 ; AVX512DQ-FCP-NEXT: vpternlogq $226, %zmm2, %zmm19, %zmm0
8098 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1}
8099 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
8100 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm23, %zmm1, %zmm9
8101 ; AVX512DQ-FCP-NEXT: vpternlogq $228, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm21, %zmm24
8102 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm24, %zmm1, %zmm5
8103 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rsi)
8104 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
8105 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rcx)
8106 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%r8)
8107 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%r9)
8108 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8109 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rax)
8110 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8111 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
8112 ; AVX512DQ-FCP-NEXT: vzeroupper
8113 ; AVX512DQ-FCP-NEXT: retq
8115 ; AVX512BW-LABEL: load_i16_stride7_vf32:
8116 ; AVX512BW: # %bb.0:
8117 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
8118 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
8119 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm3
8120 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm6
8121 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm7
8122 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
8123 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
8124 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4
8125 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5
8126 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9]
8127 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
8128 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm8
8129 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0]
8130 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
8131 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0
8132 ; AVX512BW-NEXT: kmovd %edi, %k1
8133 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1}
8134 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0]
8135 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
8136 ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm8
8137 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57,0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57]
8138 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
8139 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm8, %zmm9
8140 ; AVX512BW-NEXT: movl $-524288, %edi # imm = 0xFFF80000
8141 ; AVX512BW-NEXT: kmovd %edi, %k1
8142 ; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm2 {%k1}
8143 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0]
8144 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
8145 ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm8
8146 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58]
8147 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
8148 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm8, %zmm9
8149 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0]
8150 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
8151 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42]
8152 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
8153 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm8
8154 ; AVX512BW-NEXT: movl $511, %edi # imm = 0x1FF
8155 ; AVX512BW-NEXT: kmovd %edi, %k2
8156 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm8 {%k2}
8157 ; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm8 {%k1}
8158 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0]
8159 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
8160 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm9
8161 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59]
8162 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8163 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm9, %zmm10
8164 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43]
8165 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
8166 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm11
8167 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0]
8168 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9
8169 ; AVX512BW-NEXT: movl $261632, %edi # imm = 0x3FE00
8170 ; AVX512BW-NEXT: kmovd %edi, %k1
8171 ; AVX512BW-NEXT: vmovdqu16 %zmm11, %zmm9 {%k1}
8172 ; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00
8173 ; AVX512BW-NEXT: kmovd %edi, %k2
8174 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2}
8175 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0]
8176 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8177 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
8178 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60,0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60]
8179 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
8180 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm11
8181 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44]
8182 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8183 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm10
8184 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm12 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0]
8185 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm12
8186 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm12 {%k1}
8187 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2}
8188 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0]
8189 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8190 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
8191 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61]
8192 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
8193 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm11
8194 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45]
8195 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8196 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm10
8197 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm13 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0]
8198 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm13
8199 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm13 {%k1}
8200 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2}
8201 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0]
8202 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8203 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
8204 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62]
8205 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
8206 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm11
8207 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14]
8208 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8209 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm10
8210 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0]
8211 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm14
8212 ; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm14 {%k1}
8213 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm14 {%k2}
8214 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0]
8215 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8216 ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm10
8217 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63]
8218 ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
8219 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm6
8220 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15]
8221 ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
8222 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm3
8223 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0]
8224 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm4
8225 ; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1}
8226 ; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm4 {%k2}
8227 ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rsi)
8228 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rdx)
8229 ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rcx)
8230 ; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r8)
8231 ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r9)
8232 ; AVX512BW-NEXT: vmovdqa64 %zmm14, (%r10)
8233 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax)
8234 ; AVX512BW-NEXT: vzeroupper
8235 ; AVX512BW-NEXT: retq
8237 ; AVX512BW-FCP-LABEL: load_i16_stride7_vf32:
8238 ; AVX512BW-FCP: # %bb.0:
8239 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8240 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
8241 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3
8242 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm6
8243 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm7
8244 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
8245 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
8246 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
8247 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5
8248 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9]
8249 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
8250 ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm8
8251 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0]
8252 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
8253 ; AVX512BW-FCP-NEXT: movw $992, %di # imm = 0x3E0
8254 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1
8255 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1}
8256 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0]
8257 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
8258 ; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm8
8259 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57,0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57]
8260 ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
8261 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm8, %zmm9
8262 ; AVX512BW-FCP-NEXT: movl $-524288, %edi # imm = 0xFFF80000
8263 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1
8264 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm2 {%k1}
8265 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0]
8266 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
8267 ; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm8
8268 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58]
8269 ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
8270 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm8, %zmm9
8271 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0]
8272 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
8273 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42]
8274 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
8275 ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm8
8276 ; AVX512BW-FCP-NEXT: movl $511, %edi # imm = 0x1FF
8277 ; AVX512BW-FCP-NEXT: kmovd %edi, %k2
8278 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm8 {%k2}
8279 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm8 {%k1}
8280 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0]
8281 ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
8282 ; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm9
8283 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59]
8284 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8285 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm9, %zmm10
8286 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43]
8287 ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
8288 ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm11
8289 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0]
8290 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9
8291 ; AVX512BW-FCP-NEXT: movl $261632, %edi # imm = 0x3FE00
8292 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1
8293 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm9 {%k1}
8294 ; AVX512BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00
8295 ; AVX512BW-FCP-NEXT: kmovd %edi, %k2
8296 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2}
8297 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0]
8298 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8299 ; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
8300 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60,0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60]
8301 ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
8302 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm10, %zmm11
8303 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44]
8304 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8305 ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm10
8306 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0]
8307 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm12
8308 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm12 {%k1}
8309 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2}
8310 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0]
8311 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8312 ; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
8313 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61]
8314 ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
8315 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm10, %zmm11
8316 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45]
8317 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8318 ; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm10
8319 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0]
8320 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm13
8321 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm13 {%k1}
8322 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2}
8323 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0]
8324 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8325 ; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
8326 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62]
8327 ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
8328 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm10, %zmm11
8329 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14]
8330 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8331 ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm10
8332 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0]
8333 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm14
8334 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm14 {%k1}
8335 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm14 {%k2}
8336 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0]
8337 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8338 ; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm10
8339 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63]
8340 ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
8341 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm10, %zmm6
8342 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15]
8343 ; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
8344 ; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm3
8345 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0]
8346 ; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm4
8347 ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1}
8348 ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm4 {%k2}
8349 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi)
8350 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx)
8351 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rcx)
8352 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%r8)
8353 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r9)
8354 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%r10)
8355 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
8356 ; AVX512BW-FCP-NEXT: vzeroupper
8357 ; AVX512BW-FCP-NEXT: retq
8359 ; AVX512DQ-BW-LABEL: load_i16_stride7_vf32:
8360 ; AVX512DQ-BW: # %bb.0:
8361 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
8362 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
8363 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm3
8364 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm6
8365 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm7
8366 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
8367 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
8368 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm4
8369 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm5
8370 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9]
8371 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
8372 ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm8
8373 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0]
8374 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
8375 ; AVX512DQ-BW-NEXT: movw $992, %di # imm = 0x3E0
8376 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1
8377 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1}
8378 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0]
8379 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
8380 ; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm8
8381 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57,0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57]
8382 ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
8383 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm8, %zmm9
8384 ; AVX512DQ-BW-NEXT: movl $-524288, %edi # imm = 0xFFF80000
8385 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1
8386 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm2 {%k1}
8387 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0]
8388 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
8389 ; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm8
8390 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58]
8391 ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
8392 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm8, %zmm9
8393 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0]
8394 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
8395 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42]
8396 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
8397 ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm8
8398 ; AVX512DQ-BW-NEXT: movl $511, %edi # imm = 0x1FF
8399 ; AVX512DQ-BW-NEXT: kmovd %edi, %k2
8400 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm8 {%k2}
8401 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm8 {%k1}
8402 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0]
8403 ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
8404 ; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm9
8405 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59]
8406 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8407 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm9, %zmm10
8408 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43]
8409 ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
8410 ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm11
8411 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0]
8412 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9
8413 ; AVX512DQ-BW-NEXT: movl $261632, %edi # imm = 0x3FE00
8414 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1
8415 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm11, %zmm9 {%k1}
8416 ; AVX512DQ-BW-NEXT: movw $-512, %di # imm = 0xFE00
8417 ; AVX512DQ-BW-NEXT: kmovd %edi, %k2
8418 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2}
8419 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0]
8420 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8421 ; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
8422 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60,0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60]
8423 ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
8424 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm11
8425 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44]
8426 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8427 ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm10
8428 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm12 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0]
8429 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm12
8430 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm12 {%k1}
8431 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2}
8432 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0]
8433 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8434 ; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
8435 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61]
8436 ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
8437 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm11
8438 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45]
8439 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8440 ; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm10
8441 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm13 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0]
8442 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm13
8443 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm13 {%k1}
8444 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2}
8445 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0]
8446 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8447 ; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
8448 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62]
8449 ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
8450 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm11
8451 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14]
8452 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8453 ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm10
8454 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0]
8455 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm14
8456 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm14 {%k1}
8457 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm14 {%k2}
8458 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0]
8459 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8460 ; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm7, %zmm10
8461 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63]
8462 ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
8463 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm10, %zmm6
8464 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15]
8465 ; AVX512DQ-BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
8466 ; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm5, %zmm3
8467 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0]
8468 ; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm4
8469 ; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1}
8470 ; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm4 {%k2}
8471 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rsi)
8472 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rdx)
8473 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rcx)
8474 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%r8)
8475 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%r9)
8476 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%r10)
8477 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rax)
8478 ; AVX512DQ-BW-NEXT: vzeroupper
8479 ; AVX512DQ-BW-NEXT: retq
8481 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride7_vf32:
8482 ; AVX512DQ-BW-FCP: # %bb.0:
8483 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8484 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
8485 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3
8486 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm6
8487 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm7
8488 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
8489 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
8490 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
8491 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5
8492 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9]
8493 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
8494 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm8
8495 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0]
8496 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
8497 ; AVX512DQ-BW-FCP-NEXT: movw $992, %di # imm = 0x3E0
8498 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
8499 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1}
8500 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0]
8501 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
8502 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm8
8503 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57,0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57]
8504 ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
8505 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm8, %zmm9
8506 ; AVX512DQ-BW-FCP-NEXT: movl $-524288, %edi # imm = 0xFFF80000
8507 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
8508 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm2 {%k1}
8509 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0]
8510 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
8511 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm8
8512 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58]
8513 ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
8514 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm8, %zmm9
8515 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0]
8516 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
8517 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42]
8518 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
8519 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm8
8520 ; AVX512DQ-BW-FCP-NEXT: movl $511, %edi # imm = 0x1FF
8521 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2
8522 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm8 {%k2}
8523 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm8 {%k1}
8524 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0]
8525 ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
8526 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm9
8527 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59]
8528 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8529 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm9, %zmm10
8530 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43]
8531 ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
8532 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm11
8533 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0]
8534 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9
8535 ; AVX512DQ-BW-FCP-NEXT: movl $261632, %edi # imm = 0x3FE00
8536 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
8537 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm9 {%k1}
8538 ; AVX512DQ-BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00
8539 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2
8540 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2}
8541 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0]
8542 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8543 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
8544 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60,0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60]
8545 ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
8546 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm10, %zmm11
8547 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44]
8548 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8549 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm10
8550 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0]
8551 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm12
8552 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm12 {%k1}
8553 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2}
8554 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0]
8555 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8556 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
8557 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61]
8558 ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
8559 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm10, %zmm11
8560 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45]
8561 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8562 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm4, %zmm10
8563 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0]
8564 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm13
8565 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm13 {%k1}
8566 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2}
8567 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0]
8568 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8569 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm6, %zmm10
8570 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62]
8571 ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
8572 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm10, %zmm11
8573 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14]
8574 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8575 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm10
8576 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0]
8577 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm14
8578 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm14 {%k1}
8579 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm14 {%k2}
8580 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0]
8581 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
8582 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm7, %zmm10
8583 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63]
8584 ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
8585 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm10, %zmm6
8586 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15]
8587 ; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
8588 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm5, %zmm3
8589 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0]
8590 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm4
8591 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1}
8592 ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm6, %zmm4 {%k2}
8593 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rsi)
8594 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx)
8595 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rcx)
8596 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%r8)
8597 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r9)
8598 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%r10)
8599 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
8600 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
8601 ; AVX512DQ-BW-FCP-NEXT: retq
8602 %wide.vec = load <224 x i16>, ptr %in.vec, align 64
8603 %strided.vec0 = shufflevector <224 x i16> %wide.vec, <224 x i16> poison, <32 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105, i32 112, i32 119, i32 126, i32 133, i32 140, i32 147, i32 154, i32 161, i32 168, i32 175, i32 182, i32 189, i32 196, i32 203, i32 210, i32 217>
8604 %strided.vec1 = shufflevector <224 x i16> %wide.vec, <224 x i16> poison, <32 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106, i32 113, i32 120, i32 127, i32 134, i32 141, i32 148, i32 155, i32 162, i32 169, i32 176, i32 183, i32 190, i32 197, i32 204, i32 211, i32 218>
8605 %strided.vec2 = shufflevector <224 x i16> %wide.vec, <224 x i16> poison, <32 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107, i32 114, i32 121, i32 128, i32 135, i32 142, i32 149, i32 156, i32 163, i32 170, i32 177, i32 184, i32 191, i32 198, i32 205, i32 212, i32 219>
8606 %strided.vec3 = shufflevector <224 x i16> %wide.vec, <224 x i16> poison, <32 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108, i32 115, i32 122, i32 129, i32 136, i32 143, i32 150, i32 157, i32 164, i32 171, i32 178, i32 185, i32 192, i32 199, i32 206, i32 213, i32 220>
8607 %strided.vec4 = shufflevector <224 x i16> %wide.vec, <224 x i16> poison, <32 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109, i32 116, i32 123, i32 130, i32 137, i32 144, i32 151, i32 158, i32 165, i32 172, i32 179, i32 186, i32 193, i32 200, i32 207, i32 214, i32 221>
8608 %strided.vec5 = shufflevector <224 x i16> %wide.vec, <224 x i16> poison, <32 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110, i32 117, i32 124, i32 131, i32 138, i32 145, i32 152, i32 159, i32 166, i32 173, i32 180, i32 187, i32 194, i32 201, i32 208, i32 215, i32 222>
8609 %strided.vec6 = shufflevector <224 x i16> %wide.vec, <224 x i16> poison, <32 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111, i32 118, i32 125, i32 132, i32 139, i32 146, i32 153, i32 160, i32 167, i32 174, i32 181, i32 188, i32 195, i32 202, i32 209, i32 216, i32 223>
8610 store <32 x i16> %strided.vec0, ptr %out.vec0, align 64
8611 store <32 x i16> %strided.vec1, ptr %out.vec1, align 64
8612 store <32 x i16> %strided.vec2, ptr %out.vec2, align 64
8613 store <32 x i16> %strided.vec3, ptr %out.vec3, align 64
8614 store <32 x i16> %strided.vec4, ptr %out.vec4, align 64
8615 store <32 x i16> %strided.vec5, ptr %out.vec5, align 64
8616 store <32 x i16> %strided.vec6, ptr %out.vec6, align 64
8617 ret void
8618 }
8620 define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
8621 ; SSE-LABEL: load_i16_stride7_vf64:
8622 ; SSE: # %bb.0:
8623 ; SSE-NEXT: subq $1352, %rsp # imm = 0x548
8624 ; SSE-NEXT: movdqa 640(%rdi), %xmm9
8625 ; SSE-NEXT: movdqa 624(%rdi), %xmm12
8626 ; SSE-NEXT: movdqa %xmm12, (%rsp) # 16-byte Spill
8627 ; SSE-NEXT: movdqa 112(%rdi), %xmm8
8628 ; SSE-NEXT: movdqa 128(%rdi), %xmm10
8629 ; SSE-NEXT: movaps 160(%rdi), %xmm6
8630 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8631 ; SSE-NEXT: movaps 144(%rdi), %xmm13
8632 ; SSE-NEXT: movdqa 192(%rdi), %xmm2
8633 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8634 ; SSE-NEXT: movdqa 176(%rdi), %xmm4
8635 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8636 ; SSE-NEXT: movdqa 208(%rdi), %xmm11
8637 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,0,0,0]
8638 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8639 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0]
8640 ; SSE-NEXT: movdqa %xmm3, %xmm1
8641 ; SSE-NEXT: pandn %xmm0, %xmm1
8642 ; SSE-NEXT: movdqa %xmm4, %xmm0
8643 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
8644 ; SSE-NEXT: pand %xmm3, %xmm0
8645 ; SSE-NEXT: por %xmm1, %xmm0
8646 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,65535,65535,65535,65535,0,0,0]
8647 ; SSE-NEXT: movdqa %xmm14, %xmm1
8648 ; SSE-NEXT: pandn %xmm0, %xmm1
8649 ; SSE-NEXT: movaps %xmm13, %xmm0
8650 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8651 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm6[2,2]
8652 ; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,0,0,65535,65535,65535]
8653 ; SSE-NEXT: movaps %xmm6, %xmm2
8654 ; SSE-NEXT: andnps %xmm0, %xmm2
8655 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,3,3]
8656 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8657 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,0,3]
8658 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8659 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
8660 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
8661 ; SSE-NEXT: movdqa 656(%rdi), %xmm0
8662 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8663 ; SSE-NEXT: pand %xmm6, %xmm4
8664 ; SSE-NEXT: por %xmm2, %xmm4
8665 ; SSE-NEXT: pand %xmm14, %xmm4
8666 ; SSE-NEXT: por %xmm1, %xmm4
8667 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8668 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8669 ; SSE-NEXT: movdqa %xmm3, %xmm1
8670 ; SSE-NEXT: pandn %xmm0, %xmm1
8671 ; SSE-NEXT: movdqa %xmm12, %xmm0
8672 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
8673 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8674 ; SSE-NEXT: pand %xmm3, %xmm0
8675 ; SSE-NEXT: por %xmm1, %xmm0
8676 ; SSE-NEXT: movdqa %xmm14, %xmm1
8677 ; SSE-NEXT: pandn %xmm0, %xmm1
8678 ; SSE-NEXT: movaps 608(%rdi), %xmm2
8679 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8680 ; SSE-NEXT: movaps 592(%rdi), %xmm0
8681 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8682 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2]
8683 ; SSE-NEXT: movaps %xmm6, %xmm2
8684 ; SSE-NEXT: andnps %xmm0, %xmm2
8685 ; SSE-NEXT: movdqa 560(%rdi), %xmm15
8686 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,1,0,3]
8687 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8688 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
8689 ; SSE-NEXT: movdqa 576(%rdi), %xmm5
8690 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,2,3,3]
8691 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8692 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
8693 ; SSE-NEXT: pand %xmm6, %xmm4
8694 ; SSE-NEXT: por %xmm2, %xmm4
8695 ; SSE-NEXT: pand %xmm14, %xmm4
8696 ; SSE-NEXT: por %xmm1, %xmm4
8697 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8698 ; SSE-NEXT: movdqa 96(%rdi), %xmm0
8699 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8700 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8701 ; SSE-NEXT: movdqa %xmm3, %xmm1
8702 ; SSE-NEXT: pandn %xmm0, %xmm1
8703 ; SSE-NEXT: movdqa 80(%rdi), %xmm2
8704 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8705 ; SSE-NEXT: movdqa 64(%rdi), %xmm0
8706 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8707 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
8708 ; SSE-NEXT: pand %xmm3, %xmm0
8709 ; SSE-NEXT: por %xmm1, %xmm0
8710 ; SSE-NEXT: movdqa %xmm14, %xmm1
8711 ; SSE-NEXT: pandn %xmm0, %xmm1
8712 ; SSE-NEXT: movaps 32(%rdi), %xmm0
8713 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8714 ; SSE-NEXT: movaps 48(%rdi), %xmm4
8715 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8716 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm4[2,2]
8717 ; SSE-NEXT: movaps %xmm6, %xmm2
8718 ; SSE-NEXT: andnps %xmm0, %xmm2
8719 ; SSE-NEXT: movdqa (%rdi), %xmm0
8720 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8721 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
8722 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
8723 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
8724 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8725 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
8726 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
8727 ; SSE-NEXT: pand %xmm6, %xmm4
8728 ; SSE-NEXT: por %xmm2, %xmm4
8729 ; SSE-NEXT: pand %xmm14, %xmm4
8730 ; SSE-NEXT: por %xmm1, %xmm4
8731 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8732 ; SSE-NEXT: movdqa 544(%rdi), %xmm0
8733 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8734 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8735 ; SSE-NEXT: movdqa %xmm3, %xmm1
8736 ; SSE-NEXT: pandn %xmm0, %xmm1
8737 ; SSE-NEXT: movdqa 528(%rdi), %xmm2
8738 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8739 ; SSE-NEXT: movdqa 512(%rdi), %xmm0
8740 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8741 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
8742 ; SSE-NEXT: pand %xmm3, %xmm0
8743 ; SSE-NEXT: por %xmm1, %xmm0
8744 ; SSE-NEXT: movdqa %xmm14, %xmm1
8745 ; SSE-NEXT: pandn %xmm0, %xmm1
8746 ; SSE-NEXT: movaps 496(%rdi), %xmm2
8747 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8748 ; SSE-NEXT: movaps 480(%rdi), %xmm0
8749 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8750 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2]
8751 ; SSE-NEXT: movaps %xmm6, %xmm2
8752 ; SSE-NEXT: andnps %xmm0, %xmm2
8753 ; SSE-NEXT: movdqa 448(%rdi), %xmm0
8754 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8755 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
8756 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
8757 ; SSE-NEXT: movdqa 464(%rdi), %xmm12
8758 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,3,3]
8759 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8760 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
8761 ; SSE-NEXT: pand %xmm6, %xmm4
8762 ; SSE-NEXT: por %xmm2, %xmm4
8763 ; SSE-NEXT: pand %xmm14, %xmm4
8764 ; SSE-NEXT: por %xmm1, %xmm4
8765 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8766 ; SSE-NEXT: movdqa 432(%rdi), %xmm0
8767 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8768 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8769 ; SSE-NEXT: movdqa %xmm3, %xmm1
8770 ; SSE-NEXT: pandn %xmm0, %xmm1
8771 ; SSE-NEXT: movdqa 416(%rdi), %xmm2
8772 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8773 ; SSE-NEXT: movdqa 400(%rdi), %xmm0
8774 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8775 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
8776 ; SSE-NEXT: pand %xmm3, %xmm0
8777 ; SSE-NEXT: por %xmm1, %xmm0
8778 ; SSE-NEXT: movdqa %xmm14, %xmm1
8779 ; SSE-NEXT: pandn %xmm0, %xmm1
8780 ; SSE-NEXT: movaps 384(%rdi), %xmm2
8781 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8782 ; SSE-NEXT: movaps 368(%rdi), %xmm0
8783 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8784 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2]
8785 ; SSE-NEXT: movaps %xmm6, %xmm2
8786 ; SSE-NEXT: andnps %xmm0, %xmm2
8787 ; SSE-NEXT: movdqa 336(%rdi), %xmm0
8788 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8789 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
8790 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
8791 ; SSE-NEXT: movdqa 352(%rdi), %xmm0
8792 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8793 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
8794 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
8795 ; SSE-NEXT: pand %xmm6, %xmm4
8796 ; SSE-NEXT: por %xmm2, %xmm4
8797 ; SSE-NEXT: pand %xmm14, %xmm4
8798 ; SSE-NEXT: por %xmm1, %xmm4
8799 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8800 ; SSE-NEXT: movdqa 880(%rdi), %xmm0
8801 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8802 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8803 ; SSE-NEXT: movdqa %xmm3, %xmm1
8804 ; SSE-NEXT: pandn %xmm0, %xmm1
8805 ; SSE-NEXT: movdqa 864(%rdi), %xmm4
8806 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8807 ; SSE-NEXT: movdqa 848(%rdi), %xmm0
8808 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8809 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
8810 ; SSE-NEXT: pand %xmm3, %xmm0
8811 ; SSE-NEXT: por %xmm1, %xmm0
8812 ; SSE-NEXT: movdqa %xmm14, %xmm1
8813 ; SSE-NEXT: pandn %xmm0, %xmm1
8814 ; SSE-NEXT: movaps 832(%rdi), %xmm2
8815 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8816 ; SSE-NEXT: movaps 816(%rdi), %xmm0
8817 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8818 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2]
8819 ; SSE-NEXT: movaps %xmm6, %xmm2
8820 ; SSE-NEXT: andnps %xmm0, %xmm2
8821 ; SSE-NEXT: movdqa 784(%rdi), %xmm0
8822 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8823 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
8824 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
8825 ; SSE-NEXT: movdqa 800(%rdi), %xmm0
8826 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8827 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
8828 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
8829 ; SSE-NEXT: pand %xmm6, %xmm4
8830 ; SSE-NEXT: por %xmm2, %xmm4
8831 ; SSE-NEXT: pand %xmm14, %xmm4
8832 ; SSE-NEXT: por %xmm1, %xmm4
8833 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8834 ; SSE-NEXT: movdqa 320(%rdi), %xmm0
8835 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8836 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8837 ; SSE-NEXT: movdqa %xmm3, %xmm1
8838 ; SSE-NEXT: pandn %xmm0, %xmm1
8839 ; SSE-NEXT: movdqa 304(%rdi), %xmm2
8840 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8841 ; SSE-NEXT: movdqa 288(%rdi), %xmm0
8842 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8843 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
8844 ; SSE-NEXT: pand %xmm3, %xmm0
8845 ; SSE-NEXT: por %xmm1, %xmm0
8846 ; SSE-NEXT: movdqa %xmm14, %xmm1
8847 ; SSE-NEXT: pandn %xmm0, %xmm1
8848 ; SSE-NEXT: movaps 272(%rdi), %xmm2
8849 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8850 ; SSE-NEXT: movaps 256(%rdi), %xmm0
8851 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8852 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2]
8853 ; SSE-NEXT: movaps %xmm6, %xmm2
8854 ; SSE-NEXT: andnps %xmm0, %xmm2
8855 ; SSE-NEXT: movdqa 224(%rdi), %xmm0
8856 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8857 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
8858 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
8859 ; SSE-NEXT: movdqa 240(%rdi), %xmm0
8860 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8861 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
8862 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
8863 ; SSE-NEXT: pand %xmm6, %xmm4
8864 ; SSE-NEXT: por %xmm2, %xmm4
8865 ; SSE-NEXT: pand %xmm14, %xmm4
8866 ; SSE-NEXT: por %xmm1, %xmm4
8867 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8868 ; SSE-NEXT: movdqa 768(%rdi), %xmm0
8869 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8870 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8871 ; SSE-NEXT: movdqa %xmm3, %xmm1
8872 ; SSE-NEXT: pandn %xmm0, %xmm1
8873 ; SSE-NEXT: movdqa 752(%rdi), %xmm2
8874 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8875 ; SSE-NEXT: movdqa 736(%rdi), %xmm0
8876 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8877 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
8878 ; SSE-NEXT: pand %xmm3, %xmm0
8879 ; SSE-NEXT: por %xmm1, %xmm0
8880 ; SSE-NEXT: movdqa %xmm14, %xmm1
8881 ; SSE-NEXT: pandn %xmm0, %xmm1
8882 ; SSE-NEXT: movaps 720(%rdi), %xmm2
8883 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8884 ; SSE-NEXT: movaps 704(%rdi), %xmm0
8885 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8886 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2],xmm2[2,2]
8887 ; SSE-NEXT: movaps %xmm6, %xmm2
8888 ; SSE-NEXT: andnps %xmm0, %xmm2
8889 ; SSE-NEXT: movdqa 672(%rdi), %xmm0
8890 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8891 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
8892 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,7,6,7]
8893 ; SSE-NEXT: movdqa 688(%rdi), %xmm0
8894 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8895 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
8896 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3]
8897 ; SSE-NEXT: pand %xmm6, %xmm4
8898 ; SSE-NEXT: por %xmm2, %xmm4
8899 ; SSE-NEXT: pand %xmm14, %xmm4
8900 ; SSE-NEXT: por %xmm1, %xmm4
8901 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8902 ; SSE-NEXT: pslldq {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5]
8903 ; SSE-NEXT: movdqa %xmm3, %xmm1
8904 ; SSE-NEXT: pandn %xmm11, %xmm1
8905 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8906 ; SSE-NEXT: psrld $16, %xmm0
8907 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8908 ; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
8909 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
8910 ; SSE-NEXT: pand %xmm3, %xmm2
8911 ; SSE-NEXT: por %xmm1, %xmm2
8912 ; SSE-NEXT: movdqa %xmm14, %xmm1
8913 ; SSE-NEXT: pandn %xmm2, %xmm1
8914 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,65535,65535,65535,65535]
8915 ; SSE-NEXT: movdqa %xmm7, %xmm2
8916 ; SSE-NEXT: pandn %xmm8, %xmm2
8917 ; SSE-NEXT: pand %xmm7, %xmm10
8918 ; SSE-NEXT: por %xmm2, %xmm10
8919 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
8920 ; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7]
8921 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[2,1,2,1]
8922 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
8923 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
8924 ; SSE-NEXT: movdqa %xmm6, %xmm0
8925 ; SSE-NEXT: pandn %xmm2, %xmm0
8926 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,3,2,3]
8927 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
8928 ; SSE-NEXT: pand %xmm6, %xmm2
8929 ; SSE-NEXT: por %xmm2, %xmm0
8930 ; SSE-NEXT: pand %xmm14, %xmm0
8931 ; SSE-NEXT: por %xmm1, %xmm0
8932 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8933 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8934 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
8935 ; SSE-NEXT: movdqa %xmm3, %xmm2
8936 ; SSE-NEXT: pandn %xmm1, %xmm2
8937 ; SSE-NEXT: psrld $16, %xmm9
8938 ; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload
8939 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
8940 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1]
8941 ; SSE-NEXT: pand %xmm3, %xmm4
8942 ; SSE-NEXT: por %xmm2, %xmm4
8943 ; SSE-NEXT: movdqa %xmm14, %xmm1
8944 ; SSE-NEXT: pandn %xmm4, %xmm1
8945 ; SSE-NEXT: movdqa %xmm7, %xmm2
8946 ; SSE-NEXT: pandn %xmm15, %xmm2
8947 ; SSE-NEXT: pand %xmm7, %xmm5
8948 ; SSE-NEXT: por %xmm2, %xmm5
8949 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8950 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8951 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8952 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1]
8953 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
8954 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
8955 ; SSE-NEXT: movdqa %xmm6, %xmm0
8956 ; SSE-NEXT: pandn %xmm2, %xmm0
8957 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,3,2,3]
8958 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
8959 ; SSE-NEXT: pand %xmm6, %xmm2
8960 ; SSE-NEXT: por %xmm2, %xmm0
8961 ; SSE-NEXT: pand %xmm14, %xmm0
8962 ; SSE-NEXT: por %xmm1, %xmm0
8963 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8964 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8965 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
8966 ; SSE-NEXT: movdqa %xmm3, %xmm2
8967 ; SSE-NEXT: pandn %xmm1, %xmm2
8968 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8969 ; SSE-NEXT: psrld $16, %xmm1
8970 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8971 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
8972 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
8973 ; SSE-NEXT: pand %xmm3, %xmm4
8974 ; SSE-NEXT: por %xmm2, %xmm4
8975 ; SSE-NEXT: movdqa %xmm14, %xmm1
8976 ; SSE-NEXT: pandn %xmm4, %xmm1
8977 ; SSE-NEXT: movdqa %xmm7, %xmm2
8978 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8979 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8980 ; SSE-NEXT: pand %xmm7, %xmm4
8981 ; SSE-NEXT: por %xmm2, %xmm4
8982 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8983 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8984 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8985 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1]
8986 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
8987 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
8988 ; SSE-NEXT: movdqa %xmm6, %xmm0
8989 ; SSE-NEXT: pandn %xmm2, %xmm0
8990 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
8991 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
8992 ; SSE-NEXT: pand %xmm6, %xmm2
8993 ; SSE-NEXT: por %xmm2, %xmm0
8994 ; SSE-NEXT: pand %xmm14, %xmm0
8995 ; SSE-NEXT: por %xmm1, %xmm0
8996 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8997 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8998 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
8999 ; SSE-NEXT: movdqa %xmm3, %xmm2
9000 ; SSE-NEXT: pandn %xmm1, %xmm2
9001 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9002 ; SSE-NEXT: psrld $16, %xmm1
9003 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9004 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
9005 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
9006 ; SSE-NEXT: pand %xmm3, %xmm4
9007 ; SSE-NEXT: por %xmm2, %xmm4
9008 ; SSE-NEXT: movdqa %xmm14, %xmm1
9009 ; SSE-NEXT: pandn %xmm4, %xmm1
9010 ; SSE-NEXT: movdqa %xmm7, %xmm2
9011 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9012 ; SSE-NEXT: pand %xmm7, %xmm12
9013 ; SSE-NEXT: por %xmm2, %xmm12
9014 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9015 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9016 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9017 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1]
9018 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
9019 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
9020 ; SSE-NEXT: movdqa %xmm6, %xmm0
9021 ; SSE-NEXT: pandn %xmm2, %xmm0
9022 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[0,3,2,3]
9023 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
9024 ; SSE-NEXT: pand %xmm6, %xmm2
9025 ; SSE-NEXT: por %xmm2, %xmm0
9026 ; SSE-NEXT: pand %xmm14, %xmm0
9027 ; SSE-NEXT: por %xmm1, %xmm0
9028 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9029 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9030 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
9031 ; SSE-NEXT: movdqa %xmm3, %xmm2
9032 ; SSE-NEXT: pandn %xmm1, %xmm2
9033 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9034 ; SSE-NEXT: psrld $16, %xmm1
9035 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9036 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
9037 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
9038 ; SSE-NEXT: pand %xmm3, %xmm4
9039 ; SSE-NEXT: por %xmm2, %xmm4
9040 ; SSE-NEXT: movdqa %xmm14, %xmm1
9041 ; SSE-NEXT: pandn %xmm4, %xmm1
9042 ; SSE-NEXT: movdqa %xmm7, %xmm2
9043 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9044 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9045 ; SSE-NEXT: pand %xmm7, %xmm4
9046 ; SSE-NEXT: por %xmm2, %xmm4
9047 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9048 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9049 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9050 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1]
9051 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
9052 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
9053 ; SSE-NEXT: movdqa %xmm6, %xmm0
9054 ; SSE-NEXT: pandn %xmm2, %xmm0
9055 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
9056 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
9057 ; SSE-NEXT: pand %xmm6, %xmm2
9058 ; SSE-NEXT: por %xmm2, %xmm0
9059 ; SSE-NEXT: pand %xmm14, %xmm0
9060 ; SSE-NEXT: por %xmm1, %xmm0
9061 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9062 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9063 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
9064 ; SSE-NEXT: movdqa %xmm3, %xmm2
9065 ; SSE-NEXT: pandn %xmm1, %xmm2
9066 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9067 ; SSE-NEXT: psrld $16, %xmm1
9068 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9069 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
9070 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
9071 ; SSE-NEXT: pand %xmm3, %xmm4
9072 ; SSE-NEXT: por %xmm2, %xmm4
9073 ; SSE-NEXT: movdqa %xmm14, %xmm1
9074 ; SSE-NEXT: pandn %xmm4, %xmm1
9075 ; SSE-NEXT: movdqa %xmm7, %xmm2
9076 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9077 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9078 ; SSE-NEXT: pand %xmm7, %xmm4
9079 ; SSE-NEXT: por %xmm2, %xmm4
9080 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9081 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9082 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9083 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1]
9084 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
9085 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
9086 ; SSE-NEXT: movdqa %xmm6, %xmm0
9087 ; SSE-NEXT: pandn %xmm2, %xmm0
9088 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
9089 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
9090 ; SSE-NEXT: pand %xmm6, %xmm2
9091 ; SSE-NEXT: por %xmm2, %xmm0
9092 ; SSE-NEXT: pand %xmm14, %xmm0
9093 ; SSE-NEXT: por %xmm1, %xmm0
9094 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9095 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9096 ; SSE-NEXT: movdqa %xmm10, %xmm1
9097 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
9098 ; SSE-NEXT: movdqa %xmm3, %xmm2
9099 ; SSE-NEXT: pandn %xmm1, %xmm2
9100 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9101 ; SSE-NEXT: movdqa %xmm11, %xmm1
9102 ; SSE-NEXT: psrld $16, %xmm1
9103 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9104 ; SSE-NEXT: movdqa %xmm9, %xmm4
9105 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
9106 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
9107 ; SSE-NEXT: pand %xmm3, %xmm4
9108 ; SSE-NEXT: por %xmm2, %xmm4
9109 ; SSE-NEXT: movdqa %xmm14, %xmm1
9110 ; SSE-NEXT: pandn %xmm4, %xmm1
9111 ; SSE-NEXT: movdqa %xmm7, %xmm2
9112 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9113 ; SSE-NEXT: pandn %xmm12, %xmm2
9114 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9115 ; SSE-NEXT: movdqa %xmm13, %xmm4
9116 ; SSE-NEXT: pand %xmm7, %xmm4
9117 ; SSE-NEXT: por %xmm2, %xmm4
9118 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9119 ; SSE-NEXT: movdqa %xmm5, %xmm2
9120 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9121 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
9122 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,1]
9123 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
9124 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
9125 ; SSE-NEXT: movdqa %xmm6, %xmm15
9126 ; SSE-NEXT: pandn %xmm2, %xmm15
9127 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
9128 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
9129 ; SSE-NEXT: pand %xmm6, %xmm2
9130 ; SSE-NEXT: por %xmm2, %xmm15
9131 ; SSE-NEXT: pand %xmm14, %xmm15
9132 ; SSE-NEXT: por %xmm1, %xmm15
9133 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9134 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9135 ; SSE-NEXT: movdqa %xmm15, %xmm1
9136 ; SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5]
9137 ; SSE-NEXT: movdqa %xmm3, %xmm2
9138 ; SSE-NEXT: pandn %xmm1, %xmm2
9139 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9140 ; SSE-NEXT: psrld $16, %xmm1
9141 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9142 ; SSE-NEXT: psrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
9143 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
9144 ; SSE-NEXT: pand %xmm3, %xmm4
9145 ; SSE-NEXT: por %xmm2, %xmm4
9146 ; SSE-NEXT: movdqa %xmm14, %xmm1
9147 ; SSE-NEXT: pandn %xmm4, %xmm1
9148 ; SSE-NEXT: movdqa %xmm7, %xmm2
9149 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9150 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9151 ; SSE-NEXT: pand %xmm7, %xmm4
9152 ; SSE-NEXT: por %xmm2, %xmm4
9153 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3]
9154 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
9155 ; SSE-NEXT: pand %xmm6, %xmm2
9156 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9157 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9158 ; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
9159 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,2,1]
9160 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
9161 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7]
9162 ; SSE-NEXT: pandn %xmm4, %xmm6
9163 ; SSE-NEXT: por %xmm2, %xmm6
9164 ; SSE-NEXT: pand %xmm14, %xmm6
9165 ; SSE-NEXT: por %xmm1, %xmm6
9166 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9167 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9168 ; SSE-NEXT: # xmm1 = mem[0,1,0,1]
9169 ; SSE-NEXT: movdqa %xmm3, %xmm2
9170 ; SSE-NEXT: pandn %xmm1, %xmm2
9171 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9172 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9173 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
9174 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
9175 ; SSE-NEXT: pand %xmm3, %xmm1
9176 ; SSE-NEXT: por %xmm2, %xmm1
9177 ; SSE-NEXT: movdqa %xmm14, %xmm2
9178 ; SSE-NEXT: pandn %xmm1, %xmm2
9179 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9180 ; SSE-NEXT: # xmm1 = mem[0,1,0,3]
9181 ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,4,7]
9182 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9183 ; SSE-NEXT: # xmm1 = mem[2,2,3,3]
9184 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm1[1]
9185 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9186 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9187 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
9188 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9189 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
9190 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
9191 ; SSE-NEXT: movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3]
9192 ; SSE-NEXT: andps %xmm14, %xmm4
9193 ; SSE-NEXT: orps %xmm2, %xmm4
9194 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9195 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9196 ; SSE-NEXT: # xmm1 = mem[0,1,0,1]
9197 ; SSE-NEXT: movdqa %xmm3, %xmm2
9198 ; SSE-NEXT: pandn %xmm1, %xmm2
9199 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9200 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9201 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
9202 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
9203 ; SSE-NEXT: pand %xmm3, %xmm1
9204 ; SSE-NEXT: por %xmm2, %xmm1
9205 ; SSE-NEXT: movdqa %xmm14, %xmm2
9206 ; SSE-NEXT: pandn %xmm1, %xmm2
9207 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9208 ; SSE-NEXT: # xmm1 = mem[0,1,0,3]
9209 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5,4,7]
9210 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9211 ; SSE-NEXT: # xmm1 = mem[2,2,3,3]
9212 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm1[1]
9213 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9214 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9215 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
9216 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9217 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,1,2,3]
9218 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
9219 ; SSE-NEXT: movss {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3]
9220 ; SSE-NEXT: andps %xmm14, %xmm6
9221 ; SSE-NEXT: orps %xmm2, %xmm6
9222 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9223 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,0,1]
9224 ; SSE-NEXT: movdqa %xmm3, %xmm4
9225 ; SSE-NEXT: pandn %xmm2, %xmm4
9226 ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
9227 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,1,0,1]
9228 ; SSE-NEXT: pand %xmm3, %xmm2
9229 ; SSE-NEXT: por %xmm4, %xmm2
9230 ; SSE-NEXT: movdqa %xmm14, %xmm4
9231 ; SSE-NEXT: pandn %xmm2, %xmm4
9232 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3]
9233 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7]
9234 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[2,2,3,3]
9235 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1]
9236 ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
9237 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9238 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm12[2,1,2,3]
9239 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
9240 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
9241 ; SSE-NEXT: andps %xmm14, %xmm0
9242 ; SSE-NEXT: orps %xmm4, %xmm0
9243 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9244 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9245 ; SSE-NEXT: # xmm2 = mem[0,1,0,1]
9246 ; SSE-NEXT: movdqa %xmm3, %xmm4
9247 ; SSE-NEXT: pandn %xmm2, %xmm4
9248 ; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload
9249 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9250 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
9251 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
9252 ; SSE-NEXT: pand %xmm3, %xmm2
9253 ; SSE-NEXT: por %xmm4, %xmm2
9254 ; SSE-NEXT: movdqa %xmm14, %xmm4
9255 ; SSE-NEXT: pandn %xmm2, %xmm4
9256 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9257 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
9258 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,4,7]
9259 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9260 ; SSE-NEXT: # xmm2 = mem[2,2,3,3]
9261 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm2[1]
9262 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9263 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9264 ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
9265 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,1,2,3]
9266 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
9267 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3]
9268 ; SSE-NEXT: andps %xmm14, %xmm0
9269 ; SSE-NEXT: orps %xmm4, %xmm0
9270 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9271 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9272 ; SSE-NEXT: # xmm4 = mem[0,1,0,1]
9273 ; SSE-NEXT: movdqa %xmm3, %xmm6
9274 ; SSE-NEXT: pandn %xmm4, %xmm6
9275 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9276 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9277 ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
9278 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
9279 ; SSE-NEXT: pand %xmm3, %xmm4
9280 ; SSE-NEXT: por %xmm6, %xmm4
9281 ; SSE-NEXT: movdqa %xmm14, %xmm6
9282 ; SSE-NEXT: pandn %xmm4, %xmm6
9283 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9284 ; SSE-NEXT: # xmm4 = mem[0,1,0,3]
9285 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,4,7]
9286 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9287 ; SSE-NEXT: # xmm4 = mem[2,2,3,3]
9288 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
9289 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9290 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
9291 ; SSE-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3]
9292 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,1,2,3]
9293 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
9294 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3]
9295 ; SSE-NEXT: andps %xmm14, %xmm0
9296 ; SSE-NEXT: orps %xmm6, %xmm0
9297 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9298 ; SSE-NEXT: pshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9299 ; SSE-NEXT: # xmm4 = mem[0,1,0,1]
9300 ; SSE-NEXT: movdqa %xmm3, %xmm6
9301 ; SSE-NEXT: pandn %xmm4, %xmm6
9302 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9303 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9304 ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
9305 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
9306 ; SSE-NEXT: pand %xmm3, %xmm4
9307 ; SSE-NEXT: por %xmm6, %xmm4
9308 ; SSE-NEXT: movdqa %xmm14, %xmm6
9309 ; SSE-NEXT: pandn %xmm4, %xmm6
9310 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9311 ; SSE-NEXT: # xmm4 = mem[0,1,0,3]
9312 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,4,7]
9313 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9314 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3]
9315 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
9316 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9317 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9318 ; SSE-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3]
9319 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[2,1,2,3]
9320 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7]
9321 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm10[0],xmm0[1,2,3]
9322 ; SSE-NEXT: andps %xmm14, %xmm0
9323 ; SSE-NEXT: orps %xmm6, %xmm0
9324 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9325 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[0,1,0,1]
9326 ; SSE-NEXT: movdqa %xmm3, %xmm10
9327 ; SSE-NEXT: pandn %xmm6, %xmm10
9328 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9329 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9330 ; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3]
9331 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
9332 ; SSE-NEXT: pand %xmm3, %xmm6
9333 ; SSE-NEXT: por %xmm10, %xmm6
9334 ; SSE-NEXT: movdqa %xmm14, %xmm11
9335 ; SSE-NEXT: pandn %xmm6, %xmm11
9336 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9337 ; SSE-NEXT: # xmm6 = mem[0,1,0,3]
9338 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,4,5,4,7]
9339 ; SSE-NEXT: pshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9340 ; SSE-NEXT: # xmm6 = mem[2,2,3,3]
9341 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm6[1]
9342 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9343 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
9344 ; SSE-NEXT: # xmm10 = xmm10[0],mem[0],xmm10[1],mem[1],xmm10[2],mem[2],xmm10[3],mem[3]
9345 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[2,1,2,3]
9346 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
9347 ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm6[0],xmm0[1,2,3]
9348 ; SSE-NEXT: andps %xmm14, %xmm0
9349 ; SSE-NEXT: orps %xmm11, %xmm0
9350 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9351 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9352 ; SSE-NEXT: movdqa %xmm0, %xmm6
9353 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9354 ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3]
9355 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
9356 ; SSE-NEXT: pand %xmm3, %xmm6
9357 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9358 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,1,0,1]
9359 ; SSE-NEXT: pandn %xmm11, %xmm3
9360 ; SSE-NEXT: por %xmm6, %xmm3
9361 ; SSE-NEXT: movdqa %xmm14, %xmm6
9362 ; SSE-NEXT: pandn %xmm3, %xmm6
9363 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9364 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,1,0,3]
9365 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5,4,7]
9366 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9367 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[2,2,3,3]
9368 ; SSE-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm3[1]
9369 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9370 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9371 ; SSE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
9372 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[2,1,2,3]
9373 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7]
9374 ; SSE-NEXT: movss {{.*#+}} xmm5 = xmm11[0],xmm5[1,2,3]
9375 ; SSE-NEXT: andps %xmm14, %xmm5
9376 ; SSE-NEXT: orps %xmm6, %xmm5
9377 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9378 ; SSE-NEXT: movdqa %xmm7, %xmm6
9379 ; SSE-NEXT: pandn %xmm13, %xmm6
9380 ; SSE-NEXT: movdqa %xmm15, %xmm11
9381 ; SSE-NEXT: pand %xmm7, %xmm11
9382 ; SSE-NEXT: por %xmm6, %xmm11
9383 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm11[0,1,1,0,4,5,6,7]
9384 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
9385 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
9386 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
9387 ; SSE-NEXT: movss {{.*#+}} xmm6 = xmm3[0],xmm6[1,2,3]
9388 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
9389 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,2,1]
9390 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,7]
9391 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
9392 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
9393 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
9394 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7]
9395 ; SSE-NEXT: movdqa %xmm14, %xmm0
9396 ; SSE-NEXT: pandn %xmm3, %xmm0
9397 ; SSE-NEXT: andps %xmm14, %xmm6
9398 ; SSE-NEXT: por %xmm6, %xmm0
9399 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9400 ; SSE-NEXT: movdqa %xmm7, %xmm3
9401 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9402 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9403 ; SSE-NEXT: movdqa %xmm9, %xmm6
9404 ; SSE-NEXT: pand %xmm7, %xmm6
9405 ; SSE-NEXT: por %xmm3, %xmm6
9406 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm6[0,1,1,0,4,5,6,7]
9407 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
9408 ; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9409 ; SSE-NEXT: # xmm5 = mem[0,1,2,3,6,5,6,7]
9410 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
9411 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3]
9412 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9413 ; SSE-NEXT: movdqa %xmm0, %xmm5
9414 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9415 ; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
9416 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
9417 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7]
9418 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9419 ; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
9420 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
9421 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
9422 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7]
9423 ; SSE-NEXT: movdqa %xmm14, %xmm1
9424 ; SSE-NEXT: pandn %xmm5, %xmm1
9425 ; SSE-NEXT: andps %xmm14, %xmm3
9426 ; SSE-NEXT: por %xmm3, %xmm1
9427 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9428 ; SSE-NEXT: movdqa %xmm7, %xmm3
9429 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9430 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9431 ; SSE-NEXT: pand %xmm7, %xmm5
9432 ; SSE-NEXT: por %xmm3, %xmm5
9433 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,0,4,5,6,7]
9434 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
9435 ; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9436 ; SSE-NEXT: # xmm5 = mem[0,1,2,3,6,5,6,7]
9437 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
9438 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3]
9439 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9440 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9441 ; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
9442 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
9443 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7]
9444 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9445 ; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
9446 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
9447 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0]
9448 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,4,7]
9449 ; SSE-NEXT: movdqa %xmm14, %xmm1
9450 ; SSE-NEXT: pandn %xmm5, %xmm1
9451 ; SSE-NEXT: andps %xmm14, %xmm3
9452 ; SSE-NEXT: por %xmm3, %xmm1
9453 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9454 ; SSE-NEXT: movdqa %xmm7, %xmm3
9455 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9456 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9457 ; SSE-NEXT: pand %xmm7, %xmm5
9458 ; SSE-NEXT: por %xmm3, %xmm5
9459 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,0,4,5,6,7]
9460 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
9461 ; SSE-NEXT: pshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9462 ; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7]
9463 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
9464 ; SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
9465 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9466 ; SSE-NEXT: movdqa %xmm11, %xmm1
9467 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9468 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
9469 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
9470 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,7]
9471 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9472 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
9473 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
9474 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
9475 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
9476 ; SSE-NEXT: movdqa %xmm14, %xmm13
9477 ; SSE-NEXT: pandn %xmm1, %xmm13
9478 ; SSE-NEXT: andps %xmm14, %xmm3
9479 ; SSE-NEXT: por %xmm3, %xmm13
9480 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9481 ; SSE-NEXT: movdqa %xmm7, %xmm1
9482 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9483 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9484 ; SSE-NEXT: pand %xmm7, %xmm3
9485 ; SSE-NEXT: por %xmm1, %xmm3
9486 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,0,4,5,6,7]
9487 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
9488 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,5,6,7]
9489 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
9490 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
9491 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9492 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9493 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7]
9494 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
9495 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,7]
9496 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9497 ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
9498 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
9499 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
9500 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,4,7]
9501 ; SSE-NEXT: movdqa %xmm14, %xmm13
9502 ; SSE-NEXT: pandn %xmm3, %xmm13
9503 ; SSE-NEXT: andps %xmm14, %xmm1
9504 ; SSE-NEXT: por %xmm1, %xmm13
9505 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9506 ; SSE-NEXT: movdqa %xmm7, %xmm1
9507 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9508 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9509 ; SSE-NEXT: pand %xmm7, %xmm3
9510 ; SSE-NEXT: por %xmm1, %xmm3
9511 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,1,0,4,5,6,7]
9512 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
9513 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
9514 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
9515 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
9516 ; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload
9517 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9518 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9519 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
9520 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7]
9521 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9522 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9523 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
9524 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
9525 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7]
9526 ; SSE-NEXT: movdqa %xmm14, %xmm3
9527 ; SSE-NEXT: pandn %xmm2, %xmm3
9528 ; SSE-NEXT: andps %xmm14, %xmm1
9529 ; SSE-NEXT: por %xmm1, %xmm3
9530 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9531 ; SSE-NEXT: movdqa %xmm7, %xmm1
9532 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9533 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9534 ; SSE-NEXT: pand %xmm7, %xmm2
9535 ; SSE-NEXT: por %xmm1, %xmm2
9536 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,0,4,5,6,7]
9537 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
9538 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,6,5,6,7]
9539 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
9540 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
9541 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9542 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9543 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9544 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
9545 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7]
9546 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9547 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
9548 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
9549 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
9550 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7]
9551 ; SSE-NEXT: movdqa %xmm14, %xmm10
9552 ; SSE-NEXT: pandn %xmm2, %xmm10
9553 ; SSE-NEXT: andps %xmm14, %xmm1
9554 ; SSE-NEXT: por %xmm1, %xmm10
9555 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9556 ; SSE-NEXT: movdqa %xmm7, %xmm1
9557 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9558 ; SSE-NEXT: pand %xmm7, %xmm12
9559 ; SSE-NEXT: por %xmm1, %xmm12
9560 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,1,1,0,4,5,6,7]
9561 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
9562 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7]
9563 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
9564 ; SSE-NEXT: movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
9565 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9566 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9567 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9568 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
9569 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,7]
9570 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9571 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9572 ; SSE-NEXT: andps %xmm14, %xmm1
9573 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
9574 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0]
9575 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,4,7]
9576 ; SSE-NEXT: pandn %xmm2, %xmm14
9577 ; SSE-NEXT: por %xmm1, %xmm14
9578 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9579 ; SSE-NEXT: movdqa %xmm2, %xmm1
9580 ; SSE-NEXT: psrld $16, %xmm1
9581 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9582 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
9583 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9584 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
9585 ; SSE-NEXT: movdqa %xmm2, %xmm4
9586 ; SSE-NEXT: movdqa %xmm10, %xmm2
9587 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
9588 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9589 ; SSE-NEXT: psrlq $48, %xmm2
9590 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
9591 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9592 ; SSE-NEXT: psrlq $16, %xmm2
9593 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9594 ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
9595 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
9596 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9597 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9598 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
9599 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
9600 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9601 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
9602 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9603 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9604 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
9605 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
9606 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
9607 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7]
9608 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
9609 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9610 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9611 ; SSE-NEXT: movdqa %xmm13, %xmm1
9612 ; SSE-NEXT: psrld $16, %xmm1
9613 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9614 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
9615 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9616 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[2,3,2,3]
9617 ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7]
9618 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9619 ; SSE-NEXT: psrlq $48, %xmm2
9620 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
9621 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9622 ; SSE-NEXT: psrlq $16, %xmm2
9623 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
9624 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
9625 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9626 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,3]
9627 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
9628 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
9629 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9630 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9631 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
9632 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
9633 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
9634 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7]
9635 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9636 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9637 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9638 ; SSE-NEXT: movdqa %xmm0, %xmm1
9639 ; SSE-NEXT: psrld $16, %xmm1
9640 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9641 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
9642 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9643 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
9644 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
9645 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9646 ; SSE-NEXT: psrlq $48, %xmm2
9647 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
9648 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9649 ; SSE-NEXT: psrlq $16, %xmm2
9650 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9651 ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
9652 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
9653 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9654 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9655 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
9656 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
9657 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9658 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
9659 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9660 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9661 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
9662 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
9663 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
9664 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7]
9665 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9666 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9667 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9668 ; SSE-NEXT: movdqa %xmm0, %xmm1
9669 ; SSE-NEXT: psrld $16, %xmm1
9670 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9671 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
9672 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9673 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
9674 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
9675 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9676 ; SSE-NEXT: psrlq $48, %xmm2
9677 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
9678 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9679 ; SSE-NEXT: psrlq $16, %xmm2
9680 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9681 ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
9682 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
9683 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9684 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,1,0,3]
9685 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
9686 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3]
9687 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
9688 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
9689 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
9690 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
9691 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7]
9692 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9693 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9694 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9695 ; SSE-NEXT: movdqa %xmm2, %xmm1
9696 ; SSE-NEXT: psrld $16, %xmm1
9697 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9698 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
9699 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9700 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
9701 ; SSE-NEXT: movdqa %xmm2, %xmm4
9702 ; SSE-NEXT: movdqa %xmm0, %xmm2
9703 ; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
9704 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9705 ; SSE-NEXT: psrlq $48, %xmm2
9706 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
9707 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9708 ; SSE-NEXT: psrlq $16, %xmm2
9709 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9710 ; SSE-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
9711 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
9712 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9713 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9714 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
9715 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
9716 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm8[2],xmm2[3],xmm8[3]
9717 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9718 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9719 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
9720 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
9721 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
9722 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7]
9723 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9724 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9725 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9726 ; SSE-NEXT: movdqa %xmm12, %xmm13
9727 ; SSE-NEXT: psrld $16, %xmm13
9728 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9729 ; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7]
9730 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,3,2,3]
9731 ; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7]
9732 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9733 ; SSE-NEXT: psrlq $48, %xmm2
9734 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
9735 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9736 ; SSE-NEXT: psrlq $16, %xmm2
9737 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9738 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3]
9739 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
9740 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9741 ; SSE-NEXT: pshufd $196, (%rsp), %xmm2 # 16-byte Folded Reload
9742 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
9743 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
9744 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9745 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
9746 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9747 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9748 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
9749 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
9750 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
9751 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7]
9752 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9753 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9754 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9755 ; SSE-NEXT: movdqa %xmm0, %xmm12
9756 ; SSE-NEXT: psrld $16, %xmm12
9757 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9758 ; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7]
9759 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
9760 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
9761 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9762 ; SSE-NEXT: psrlq $48, %xmm2
9763 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
9764 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9765 ; SSE-NEXT: movdqa %xmm9, %xmm2
9766 ; SSE-NEXT: psrlq $16, %xmm2
9767 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9768 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
9769 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
9770 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9771 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9772 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
9773 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
9774 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9775 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
9776 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
9777 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
9778 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
9779 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
9780 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7]
9781 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9782 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9783 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9784 ; SSE-NEXT: movdqa %xmm2, %xmm11
9785 ; SSE-NEXT: psrld $16, %xmm11
9786 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9787 ; SSE-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
9788 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
9789 ; SSE-NEXT: movdqa %xmm2, %xmm3
9790 ; SSE-NEXT: movdqa %xmm0, %xmm2
9791 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
9792 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9793 ; SSE-NEXT: psrlq $48, %xmm2
9794 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
9795 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9796 ; SSE-NEXT: movdqa %xmm4, %xmm2
9797 ; SSE-NEXT: psrlq $16, %xmm2
9798 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9799 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
9800 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,1,1]
9801 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9802 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9803 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3]
9804 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
9805 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9806 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
9807 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9808 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
9809 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0]
9810 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
9811 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
9812 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,5,6,4,7]
9813 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
9814 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9815 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9816 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9817 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
9818 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
9819 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
9820 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9821 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9822 ; SSE-NEXT: movdqa %xmm7, %xmm1
9823 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9824 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9825 ; SSE-NEXT: pand %xmm7, %xmm2
9826 ; SSE-NEXT: por %xmm1, %xmm2
9827 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
9828 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7]
9829 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9830 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9831 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
9832 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
9833 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9834 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9835 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9836 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
9837 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
9838 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
9839 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9840 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9841 ; SSE-NEXT: movdqa %xmm7, %xmm1
9842 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9843 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9844 ; SSE-NEXT: pand %xmm7, %xmm2
9845 ; SSE-NEXT: por %xmm1, %xmm2
9846 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
9847 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7]
9848 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9849 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9850 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
9851 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
9852 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9853 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9854 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9855 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
9856 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
9857 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
9858 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9859 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9860 ; SSE-NEXT: movdqa %xmm7, %xmm1
9861 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9862 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9863 ; SSE-NEXT: pand %xmm7, %xmm2
9864 ; SSE-NEXT: por %xmm1, %xmm2
9865 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
9866 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7]
9867 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9868 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9869 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
9870 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
9871 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9872 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9873 ; SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9874 ; SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
9875 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
9876 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
9877 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9878 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9879 ; SSE-NEXT: movdqa %xmm7, %xmm1
9880 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9881 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9882 ; SSE-NEXT: pand %xmm7, %xmm2
9883 ; SSE-NEXT: por %xmm1, %xmm2
9884 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
9885 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7]
9886 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9887 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9888 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
9889 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
9890 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9891 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9892 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3]
9893 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
9894 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
9895 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
9896 ; SSE-NEXT: movdqa %xmm7, %xmm1
9897 ; SSE-NEXT: pandn (%rsp), %xmm1 # 16-byte Folded Reload
9898 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9899 ; SSE-NEXT: pand %xmm7, %xmm2
9900 ; SSE-NEXT: por %xmm1, %xmm2
9901 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
9902 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7]
9903 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9904 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9905 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
9906 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[0,2]
9907 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
9908 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,6,5,6,7]
9909 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
9910 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1]
9911 ; SSE-NEXT: movdqa %xmm7, %xmm1
9912 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9913 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9914 ; SSE-NEXT: pand %xmm7, %xmm2
9915 ; SSE-NEXT: por %xmm1, %xmm2
9916 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,3]
9917 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7]
9918 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9919 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9920 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
9921 ; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[0,2]
9922 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm15[0],xmm4[1],xmm15[1],xmm4[2],xmm15[2],xmm4[3],xmm15[3]
9923 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,6,5,6,7]
9924 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
9925 ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1]
9926 ; SSE-NEXT: movdqa %xmm7, %xmm1
9927 ; SSE-NEXT: pandn %xmm5, %xmm1
9928 ; SSE-NEXT: pand %xmm7, %xmm3
9929 ; SSE-NEXT: por %xmm1, %xmm3
9930 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,3]
9931 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,7]
9932 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
9933 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
9934 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2]
9935 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9936 ; SSE-NEXT: movdqa %xmm3, %xmm1
9937 ; SSE-NEXT: pand %xmm7, %xmm1
9938 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9939 ; SSE-NEXT: pandn %xmm5, %xmm7
9940 ; SSE-NEXT: por %xmm1, %xmm7
9941 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9942 ; SSE-NEXT: movdqa %xmm4, %xmm1
9943 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9944 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
9945 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7]
9946 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
9947 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9948 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
9949 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,3]
9950 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,7]
9951 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9952 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
9953 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
9954 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[0,2]
9955 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9956 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
9957 ; SSE-NEXT: movdqa %xmm4, %xmm1
9958 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9959 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9960 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
9961 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9962 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
9963 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
9964 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9965 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[1,1,1,1,4,5,6,7]
9966 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
9967 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,3]
9968 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm5[2,2,2,2]
9969 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
9970 ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3]
9971 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1]
9972 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9973 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
9974 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9975 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9976 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9977 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
9978 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9979 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
9980 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
9981 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9982 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9983 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
9984 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9985 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
9986 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9987 ; SSE-NEXT: # xmm0 = mem[0,1,0,3]
9988 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,2,2,2]
9989 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
9990 ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3]
9991 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1]
9992 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9993 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
9994 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9995 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
9996 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9997 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
9998 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9999 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
10000 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
10001 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10002 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10003 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
10004 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10005 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
10006 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10007 ; SSE-NEXT: # xmm0 = mem[0,1,0,3]
10008 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[2,2,2,2]
10009 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
10010 ; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm0[2],xmm8[3],xmm0[3]
10011 ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm1[0],xmm8[1]
10012 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10013 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
10014 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10015 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
10016 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
10017 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
10018 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10019 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
10020 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
10021 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10022 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10023 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
10024 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10025 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
10026 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10027 ; SSE-NEXT: # xmm0 = mem[0,1,0,3]
10028 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,2,2,2]
10029 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
10030 ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3]
10031 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm1[0],xmm7[1]
10032 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10033 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
10034 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10035 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
10036 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
10037 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
10038 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10039 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
10040 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
10041 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10042 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10043 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
10044 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10045 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
10046 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10047 ; SSE-NEXT: # xmm0 = mem[0,1,0,3]
10048 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,2,2,2]
10049 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
10050 ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
10051 ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1]
10052 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10053 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
10054 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10055 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
10056 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
10057 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
10058 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10059 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
10060 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
10061 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10062 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10063 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
10064 ; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload
10065 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
10066 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10067 ; SSE-NEXT: # xmm0 = mem[0,1,0,3]
10068 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,2,2,2]
10069 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
10070 ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
10071 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1]
10072 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10073 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
10074 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10075 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
10076 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
10077 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
10078 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10079 ; SSE-NEXT: # xmm1 = mem[2,1,2,3]
10080 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7]
10081 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
10082 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10083 ; SSE-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7]
10084 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10085 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
10086 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
10087 ; SSE-NEXT: # xmm2 = mem[0,1,0,3]
10088 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,2,2,2]
10089 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
10090 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3]
10091 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
10092 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1]
10093 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10094 ; SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
10095 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
10096 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
10097 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
10098 ; SSE-NEXT: # xmm2 = mem[2,1,2,3]
10099 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
10100 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
10101 ; SSE-NEXT: pshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10102 ; SSE-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
10103 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10104 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
10105 ; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
10106 ; SSE-NEXT: # xmm3 = mem[0,1,0,3]
10107 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2]
10108 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,4,7]
10109 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
10110 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
10111 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10112 ; SSE-NEXT: movaps %xmm1, 96(%rsi)
10113 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10114 ; SSE-NEXT: movaps %xmm1, 32(%rsi)
10115 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10116 ; SSE-NEXT: movaps %xmm1, 112(%rsi)
10117 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10118 ; SSE-NEXT: movaps %xmm1, 48(%rsi)
10119 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10120 ; SSE-NEXT: movaps %xmm1, 64(%rsi)
10121 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10122 ; SSE-NEXT: movaps %xmm1, (%rsi)
10123 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10124 ; SSE-NEXT: movaps %xmm1, 80(%rsi)
10125 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10126 ; SSE-NEXT: movaps %xmm1, 16(%rsi)
10127 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10128 ; SSE-NEXT: movaps %xmm1, 96(%rdx)
10129 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10130 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
10131 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10132 ; SSE-NEXT: movaps %xmm1, 112(%rdx)
10133 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10134 ; SSE-NEXT: movaps %xmm1, 48(%rdx)
10135 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10136 ; SSE-NEXT: movaps %xmm1, 64(%rdx)
10137 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10138 ; SSE-NEXT: movaps %xmm1, (%rdx)
10139 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10140 ; SSE-NEXT: movaps %xmm1, 80(%rdx)
10141 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10142 ; SSE-NEXT: movaps %xmm1, 16(%rdx)
10143 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10144 ; SSE-NEXT: movaps %xmm1, 96(%rcx)
10145 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10146 ; SSE-NEXT: movaps %xmm1, 112(%rcx)
10147 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10148 ; SSE-NEXT: movaps %xmm1, 64(%rcx)
10149 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10150 ; SSE-NEXT: movaps %xmm1, 80(%rcx)
10151 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10152 ; SSE-NEXT: movaps %xmm1, 32(%rcx)
10153 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10154 ; SSE-NEXT: movaps %xmm1, 48(%rcx)
10155 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10156 ; SSE-NEXT: movaps %xmm1, (%rcx)
10157 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10158 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
10159 ; SSE-NEXT: movdqa %xmm14, 112(%r8)
10160 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10161 ; SSE-NEXT: movaps %xmm1, 96(%r8)
10162 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10163 ; SSE-NEXT: movaps %xmm1, 80(%r8)
10164 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10165 ; SSE-NEXT: movaps %xmm1, 64(%r8)
10166 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10167 ; SSE-NEXT: movaps %xmm1, 48(%r8)
10168 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10169 ; SSE-NEXT: movaps %xmm1, 32(%r8)
10170 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10171 ; SSE-NEXT: movaps %xmm1, 16(%r8)
10172 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10173 ; SSE-NEXT: movaps %xmm1, (%r8)
10174 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10175 ; SSE-NEXT: movaps %xmm1, 112(%r9)
10176 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10177 ; SSE-NEXT: movaps %xmm1, 96(%r9)
10178 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10179 ; SSE-NEXT: movaps %xmm1, 80(%r9)
10180 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10181 ; SSE-NEXT: movaps %xmm1, 64(%r9)
10182 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10183 ; SSE-NEXT: movaps %xmm1, 48(%r9)
10184 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10185 ; SSE-NEXT: movaps %xmm1, 32(%r9)
10186 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10187 ; SSE-NEXT: movaps %xmm1, 16(%r9)
10188 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10189 ; SSE-NEXT: movaps %xmm1, (%r9)
10190 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
10191 ; SSE-NEXT: movaps %xmm11, 112(%rax)
10192 ; SSE-NEXT: movaps %xmm12, 96(%rax)
10193 ; SSE-NEXT: movaps %xmm13, 80(%rax)
10194 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10195 ; SSE-NEXT: movaps %xmm1, 64(%rax)
10196 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10197 ; SSE-NEXT: movaps %xmm1, 48(%rax)
10198 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10199 ; SSE-NEXT: movaps %xmm1, 32(%rax)
10200 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10201 ; SSE-NEXT: movaps %xmm1, 16(%rax)
10202 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10203 ; SSE-NEXT: movaps %xmm1, (%rax)
10204 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
10205 ; SSE-NEXT: movapd %xmm0, 112(%rax)
10206 ; SSE-NEXT: movapd %xmm4, 96(%rax)
10207 ; SSE-NEXT: movapd %xmm5, 80(%rax)
10208 ; SSE-NEXT: movapd %xmm6, 64(%rax)
10209 ; SSE-NEXT: movapd %xmm7, 48(%rax)
10210 ; SSE-NEXT: movapd %xmm8, 32(%rax)
10211 ; SSE-NEXT: movapd %xmm9, 16(%rax)
10212 ; SSE-NEXT: movapd %xmm10, (%rax)
10213 ; SSE-NEXT: addq $1352, %rsp # imm = 0x548
10216 ; AVX-LABEL: load_i16_stride7_vf64:
10218 ; AVX-NEXT: subq $1544, %rsp # imm = 0x608
10219 ; AVX-NEXT: vmovdqa 176(%rdi), %xmm0
10220 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10221 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
10222 ; AVX-NEXT: vmovdqa 160(%rdi), %xmm1
10223 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10224 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
10225 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
10226 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm1
10227 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10228 ; AVX-NEXT: vpsrlq $16, %xmm1, %xmm1
10229 ; AVX-NEXT: vmovdqa 144(%rdi), %xmm2
10230 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10231 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
10232 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
10233 ; AVX-NEXT: vmovdqa 208(%rdi), %xmm2
10234 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10235 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm1
10236 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10237 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
10238 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
10239 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
10240 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5],xmm1[6,7]
10241 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm7
10242 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm0
10243 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10244 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
10245 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10246 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm2
10247 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10248 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
10249 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
10250 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm2
10251 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10252 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
10253 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
10254 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
10255 ; AVX-NEXT: vmovdqa (%rdi), %xmm3
10256 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10257 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm0
10258 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10259 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
10260 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
10261 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
10262 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
10263 ; AVX-NEXT: vmovaps 32(%rdi), %xmm3
10264 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10265 ; AVX-NEXT: vmovaps 48(%rdi), %xmm6
10266 ; AVX-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[2],xmm6[2],zero
10267 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10268 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7]
10269 ; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
10270 ; AVX-NEXT: vandnps %ymm2, %ymm0, %ymm2
10271 ; AVX-NEXT: vandps %ymm0, %ymm3, %ymm3
10272 ; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2
10273 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
10274 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
10275 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10276 ; AVX-NEXT: vmovdqa 624(%rdi), %xmm1
10277 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10278 ; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
10279 ; AVX-NEXT: vmovdqa 608(%rdi), %xmm2
10280 ; AVX-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
10281 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
10282 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
10283 ; AVX-NEXT: vmovdqa 576(%rdi), %xmm2
10284 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10285 ; AVX-NEXT: vpsrlq $16, %xmm2, %xmm2
10286 ; AVX-NEXT: vmovdqa 592(%rdi), %xmm3
10287 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10288 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
10289 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
10290 ; AVX-NEXT: vmovdqa 656(%rdi), %xmm2
10291 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10292 ; AVX-NEXT: vmovdqa 640(%rdi), %xmm3
10293 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10294 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
10295 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
10296 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
10297 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
10298 ; AVX-NEXT: vmovdqa 448(%rdi), %xmm2
10299 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10300 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
10301 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
10302 ; AVX-NEXT: vmovdqa 464(%rdi), %xmm3
10303 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10304 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
10305 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
10306 ; AVX-NEXT: vmovaps 480(%rdi), %xmm5
10307 ; AVX-NEXT: vmovaps 496(%rdi), %xmm15
10308 ; AVX-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm5[2],xmm15[2],zero
10309 ; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10310 ; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10311 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
10312 ; AVX-NEXT: vmovdqa 528(%rdi), %xmm4
10313 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10314 ; AVX-NEXT: vmovdqa 512(%rdi), %xmm3
10315 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10316 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
10317 ; AVX-NEXT: vmovdqa 544(%rdi), %xmm4
10318 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10319 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
10320 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
10321 ; AVX-NEXT: vmovdqa 560(%rdi), %xmm4
10322 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10323 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
10324 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
10325 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
10326 ; AVX-NEXT: vandps %ymm0, %ymm2, %ymm2
10327 ; AVX-NEXT: vandnps %ymm3, %ymm0, %ymm3
10328 ; AVX-NEXT: vorps %ymm3, %ymm2, %ymm2
10329 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
10330 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
10331 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10332 ; AVX-NEXT: vmovdqa 400(%rdi), %xmm1
10333 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10334 ; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
10335 ; AVX-NEXT: vmovdqa 384(%rdi), %xmm2
10336 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10337 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
10338 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
10339 ; AVX-NEXT: vmovdqa 352(%rdi), %xmm2
10340 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10341 ; AVX-NEXT: vpsrlq $16, %xmm2, %xmm2
10342 ; AVX-NEXT: vmovdqa 368(%rdi), %xmm3
10343 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10344 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
10345 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
10346 ; AVX-NEXT: vmovdqa 432(%rdi), %xmm3
10347 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10348 ; AVX-NEXT: vmovdqa 416(%rdi), %xmm2
10349 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10350 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
10351 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
10352 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
10353 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
10354 ; AVX-NEXT: vmovdqa 224(%rdi), %xmm10
10355 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,0,3]
10356 ; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10357 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
10358 ; AVX-NEXT: vmovdqa 240(%rdi), %xmm12
10359 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,2,3,3]
10360 ; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10361 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
10362 ; AVX-NEXT: vmovaps 256(%rdi), %xmm9
10363 ; AVX-NEXT: vmovaps 272(%rdi), %xmm3
10364 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10365 ; AVX-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm9[2],xmm3[2],zero
10366 ; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10367 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
10368 ; AVX-NEXT: vmovdqa 304(%rdi), %xmm4
10369 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10370 ; AVX-NEXT: vmovdqa 288(%rdi), %xmm3
10371 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10372 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
10373 ; AVX-NEXT: vmovdqa 320(%rdi), %xmm4
10374 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10375 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
10376 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
10377 ; AVX-NEXT: vmovdqa 336(%rdi), %xmm11
10378 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,3,2,3]
10379 ; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10380 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
10381 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
10382 ; AVX-NEXT: vandps %ymm0, %ymm2, %ymm2
10383 ; AVX-NEXT: vandnps %ymm3, %ymm0, %ymm3
10384 ; AVX-NEXT: vorps %ymm3, %ymm2, %ymm2
10385 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
10386 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
10387 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10388 ; AVX-NEXT: vmovdqa 848(%rdi), %xmm1
10389 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10390 ; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
10391 ; AVX-NEXT: vmovdqa 832(%rdi), %xmm2
10392 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10393 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,3,3]
10394 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
10395 ; AVX-NEXT: vmovdqa 800(%rdi), %xmm2
10396 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10397 ; AVX-NEXT: vpsrlq $16, %xmm2, %xmm2
10398 ; AVX-NEXT: vmovdqa 816(%rdi), %xmm3
10399 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10400 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
10401 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
10402 ; AVX-NEXT: vmovdqa 880(%rdi), %xmm2
10403 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10404 ; AVX-NEXT: vmovdqa 864(%rdi), %xmm13
10405 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3]
10406 ; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10407 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
10408 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
10409 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
10410 ; AVX-NEXT: vmovdqa 672(%rdi), %xmm2
10411 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10412 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
10413 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
10414 ; AVX-NEXT: vmovdqa 688(%rdi), %xmm3
10415 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10416 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
10417 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
10418 ; AVX-NEXT: vmovaps 704(%rdi), %xmm14
10419 ; AVX-NEXT: vmovaps 720(%rdi), %xmm3
10420 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10421 ; AVX-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm14[2],xmm3[2],zero
10422 ; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10423 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
10424 ; AVX-NEXT: vmovdqa 752(%rdi), %xmm4
10425 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10426 ; AVX-NEXT: vmovdqa 736(%rdi), %xmm3
10427 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10428 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
10429 ; AVX-NEXT: vmovdqa 768(%rdi), %xmm4
10430 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10431 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
10432 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm4[7]
10433 ; AVX-NEXT: vmovdqa 784(%rdi), %xmm4
10434 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10435 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
10436 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
10437 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
10438 ; AVX-NEXT: vandps %ymm0, %ymm2, %ymm2
10439 ; AVX-NEXT: vandnps %ymm3, %ymm0, %ymm0
10440 ; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0
10441 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
10442 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
10443 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10444 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10445 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10446 ; AVX-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10447 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
10448 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
10449 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10450 ; AVX-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
10451 ; AVX-NEXT: # xmm1 = mem[0,1,2,3,4,5],xmm1[6],mem[7]
10452 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
10453 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,2,4,5,6,7]
10454 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
10455 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10456 ; AVX-NEXT: vpslld $16, %xmm1, %xmm1
10457 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10458 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
10459 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
10460 ; AVX-NEXT: vpsrld $16, %xmm7, %xmm1
10461 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10462 ; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
10463 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
10464 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10465 ; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5]
10466 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
10467 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10468 ; AVX-NEXT: vpsrld $16, %xmm2, %xmm2
10469 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
10470 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10471 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
10472 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7]
10473 ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
10474 ; AVX-NEXT: vmovdqa %xmm3, %xmm4
10475 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10476 ; AVX-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm3 # 16-byte Folded Reload
10477 ; AVX-NEXT: # xmm3 = mem[0],xmm6[1],mem[2,3,4,5,6,7]
10478 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
10479 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,3,3,4,5,6,7]
10480 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7]
10481 ; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
10482 ; AVX-NEXT: vandnps %ymm2, %ymm1, %ymm2
10483 ; AVX-NEXT: vandps %ymm1, %ymm3, %ymm3
10484 ; AVX-NEXT: vmovaps %ymm1, %ymm8
10485 ; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2
10486 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3
10487 ; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
10488 ; AVX-NEXT: vandnps %ymm3, %ymm7, %ymm3
10489 ; AVX-NEXT: vandps %ymm7, %ymm2, %ymm2
10490 ; AVX-NEXT: vorps %ymm3, %ymm2, %ymm0
10491 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10492 ; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
10493 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
10494 ; AVX-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10495 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
10496 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
10497 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10498 ; AVX-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
10499 ; AVX-NEXT: # xmm3 = mem[0,1,2,3,4,5],xmm0[6],mem[7]
10500 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
10501 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7]
10502 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
10503 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10504 ; AVX-NEXT: vpslld $16, %xmm0, %xmm3
10505 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10506 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
10507 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
10508 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7]
10509 ; AVX-NEXT: vmovdqa %xmm4, %xmm1
10510 ; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3
10511 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10512 ; AVX-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
10513 ; AVX-NEXT: # xmm4 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7]
10514 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
10515 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
10516 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
10517 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10518 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm4
10519 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10520 ; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
10521 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
10522 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10523 ; AVX-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
10524 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
10525 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10526 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm5
10527 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
10528 ; AVX-NEXT: vandps %ymm3, %ymm8, %ymm3
10529 ; AVX-NEXT: vandnps %ymm4, %ymm8, %ymm4
10530 ; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3
10531 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
10532 ; AVX-NEXT: vandnps %ymm2, %ymm7, %ymm2
10533 ; AVX-NEXT: vandps %ymm7, %ymm3, %ymm3
10534 ; AVX-NEXT: vorps %ymm2, %ymm3, %ymm0
10535 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10536 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10537 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
10538 ; AVX-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10539 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
10540 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
10541 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10542 ; AVX-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
10543 ; AVX-NEXT: # xmm3 = xmm0[0,1,2,3,4,5],mem[6],xmm0[7]
10544 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
10545 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7]
10546 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
10547 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10548 ; AVX-NEXT: vpslld $16, %xmm0, %xmm3
10549 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10550 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
10551 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
10552 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm3 # 16-byte Folded Reload
10553 ; AVX-NEXT: # xmm3 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7]
10554 ; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm3
10555 ; AVX-NEXT: vmovdqa %xmm1, %xmm15
10556 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0],xmm10[1],xmm12[2,3,4,5,6,7]
10557 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
10558 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
10559 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
10560 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10561 ; AVX-NEXT: vpsrld $16, %xmm0, %xmm4
10562 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10563 ; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
10564 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
10565 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10566 ; AVX-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
10567 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
10568 ; AVX-NEXT: vpsrld $16, %xmm11, %xmm5
10569 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
10570 ; AVX-NEXT: vandps %ymm3, %ymm8, %ymm3
10571 ; AVX-NEXT: vandnps %ymm4, %ymm8, %ymm4
10572 ; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3
10573 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
10574 ; AVX-NEXT: vandnps %ymm2, %ymm7, %ymm2
10575 ; AVX-NEXT: vandps %ymm7, %ymm3, %ymm3
10576 ; AVX-NEXT: vorps %ymm2, %ymm3, %ymm0
10577 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10578 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10579 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10580 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
10581 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
10582 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
10583 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
10584 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10585 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1,2,3,4,5],xmm0[6],xmm10[7]
10586 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
10587 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,2,4,5,6,7]
10588 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
10589 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10590 ; AVX-NEXT: vpslld $16, %xmm8, %xmm3
10591 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
10592 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
10593 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
10594 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
10595 ; AVX-NEXT: vpshufb %xmm15, %xmm3, %xmm3
10596 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10597 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10598 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm15[1],xmm11[2,3,4,5,6,7]
10599 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
10600 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,0,3,3,4,5,6,7]
10601 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
10602 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10603 ; AVX-NEXT: vpsrld $16, %xmm14, %xmm4
10604 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
10605 ; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm12[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
10606 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
10607 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10608 ; AVX-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4,5]
10609 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7]
10610 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10611 ; AVX-NEXT: vpsrld $16, %xmm5, %xmm5
10612 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
10613 ; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
10614 ; AVX-NEXT: vandps %ymm5, %ymm3, %ymm3
10615 ; AVX-NEXT: vandnps %ymm4, %ymm5, %ymm4
10616 ; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3
10617 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
10618 ; AVX-NEXT: vandnps %ymm2, %ymm7, %ymm2
10619 ; AVX-NEXT: vandps %ymm7, %ymm3, %ymm3
10620 ; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2
10621 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10622 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10623 ; AVX-NEXT: vpsllq $16, %xmm2, %xmm2
10624 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10625 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
10626 ; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
10627 ; AVX-NEXT: # xmm3 = mem[0,3,2,3]
10628 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7]
10629 ; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
10630 ; AVX-NEXT: # xmm3 = mem[0,1],xmm3[2,3],mem[4,5,6,7]
10631 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
10632 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10633 ; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm3[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
10634 ; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
10635 ; AVX-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
10636 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5],xmm3[6,7]
10637 ; AVX-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
10638 ; AVX-NEXT: # xmm2 = mem[2,2,2,2]
10639 ; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
10640 ; AVX-NEXT: # xmm2 = mem[0,1,2,3,4,5],xmm2[6,7]
10641 ; AVX-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
10642 ; AVX-NEXT: # xmm4 = mem[0,1,0,1]
10643 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm4[7]
10644 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
10645 ; AVX-NEXT: # xmm4 = mem[1,1,1,1]
10646 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm4
10647 ; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
10648 ; AVX-NEXT: # xmm2 = mem[0,1,0,3]
10649 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
10650 ; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
10651 ; AVX-NEXT: # xmm5 = mem[2,2,3,3]
10652 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm2[1],xmm5[1]
10653 ; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload
10654 ; AVX-NEXT: # xmm2 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3]
10655 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,1,2,3]
10656 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
10657 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7]
10658 ; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
10659 ; AVX-NEXT: vandnps %ymm4, %ymm6, %ymm4
10660 ; AVX-NEXT: vandps %ymm6, %ymm5, %ymm5
10661 ; AVX-NEXT: vorps %ymm4, %ymm5, %ymm4
10662 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
10663 ; AVX-NEXT: vandnps %ymm3, %ymm7, %ymm3
10664 ; AVX-NEXT: vandps %ymm7, %ymm4, %ymm4
10665 ; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3
10666 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10667 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10668 ; AVX-NEXT: vpsllq $16, %xmm3, %xmm3
10669 ; AVX-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
10670 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
10671 ; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
10672 ; AVX-NEXT: # xmm4 = mem[0,3,2,3]
10673 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
10674 ; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
10675 ; AVX-NEXT: # xmm4 = mem[0,1],xmm4[2,3],mem[4,5,6,7]
10676 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
10677 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10678 ; AVX-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
10679 ; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
10680 ; AVX-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3]
10681 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
10682 ; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
10683 ; AVX-NEXT: # xmm4 = mem[0,1,0,3]
10684 ; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
10685 ; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
10686 ; AVX-NEXT: # xmm5 = mem[2,2,3,3]
10687 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm4[1],xmm5[1]
10688 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10689 ; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
10690 ; AVX-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3]
10691 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,2,3]
10692 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
10693 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3,4,5,6,7]
10694 ; AVX-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
10695 ; AVX-NEXT: # xmm6 = mem[2,2,2,2]
10696 ; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
10697 ; AVX-NEXT: # xmm6 = mem[0,1,2,3,4,5],xmm6[6,7]
10698 ; AVX-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
10699 ; AVX-NEXT: # xmm7 = mem[0,1,0,1]
10700 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm7[7]
10701 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
10702 ; AVX-NEXT: # xmm7 = mem[1,1,1,1]
10703 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
10704 ; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
10705 ; AVX-NEXT: vandps %ymm7, %ymm5, %ymm5
10706 ; AVX-NEXT: vandnps %ymm6, %ymm7, %ymm6
10707 ; AVX-NEXT: vorps %ymm6, %ymm5, %ymm5
10708 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
10709 ; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
10710 ; AVX-NEXT: vandnps %ymm3, %ymm6, %ymm3
10711 ; AVX-NEXT: vandps %ymm6, %ymm5, %ymm5
10712 ; AVX-NEXT: vorps %ymm3, %ymm5, %ymm3
10713 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10714 ; AVX-NEXT: vpsllq $16, %xmm1, %xmm3
10715 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7]
10716 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,3]
10717 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
10718 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3],xmm10[4,5,6,7]
10719 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
10720 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10721 ; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
10722 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3]
10723 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
10724 ; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
10725 ; AVX-NEXT: # xmm5 = mem[0,1,0,3]
10726 ; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
10727 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[2,2,3,3]
10728 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
10729 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
10730 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10731 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,2,3]
10732 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
10733 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3,4,5,6,7]
10734 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[2,2,2,2]
10735 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3,4,5],xmm7[6,7]
10736 ; AVX-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
10737 ; AVX-NEXT: # xmm15 = mem[0,1,0,1]
10738 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6],xmm15[7]
10739 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
10740 ; AVX-NEXT: # xmm15 = mem[1,1,1,1]
10741 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7
10742 ; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
10743 ; AVX-NEXT: vandps %ymm5, %ymm11, %ymm5
10744 ; AVX-NEXT: vandnps %ymm7, %ymm11, %ymm7
10745 ; AVX-NEXT: vorps %ymm7, %ymm5, %ymm5
10746 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
10747 ; AVX-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
10748 ; AVX-NEXT: vandnps %ymm3, %ymm9, %ymm3
10749 ; AVX-NEXT: vandps %ymm5, %ymm9, %ymm5
10750 ; AVX-NEXT: vorps %ymm3, %ymm5, %ymm0
10751 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10752 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10753 ; AVX-NEXT: vpsllq $16, %xmm0, %xmm3
10754 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
10755 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
10756 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10757 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,3,2,3]
10758 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
10759 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10760 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6,7]
10761 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
10762 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10763 ; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
10764 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
10765 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
10766 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3,4,5],xmm5[6,7]
10767 ; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
10768 ; AVX-NEXT: # xmm3 = mem[0,1,0,3]
10769 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
10770 ; AVX-NEXT: vpshufd $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
10771 ; AVX-NEXT: # xmm7 = mem[2,2,3,3]
10772 ; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm3[1],xmm7[1]
10773 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10774 ; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
10775 ; AVX-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
10776 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,2,3]
10777 ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7]
10778 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3,4,5,6,7]
10779 ; AVX-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
10780 ; AVX-NEXT: # xmm15 = mem[2,2,2,2]
10781 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10782 ; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3,4,5],xmm15[6,7]
10783 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10784 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[0,1,0,1]
10785 ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3,4,5,6],xmm14[7]
10786 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
10787 ; AVX-NEXT: # xmm15 = mem[1,1,1,1]
10788 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
10789 ; AVX-NEXT: vandps %ymm7, %ymm11, %ymm7
10790 ; AVX-NEXT: vandnps %ymm14, %ymm11, %ymm14
10791 ; AVX-NEXT: vorps %ymm7, %ymm14, %ymm7
10792 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
10793 ; AVX-NEXT: vandnps %ymm5, %ymm9, %ymm5
10794 ; AVX-NEXT: vandps %ymm7, %ymm9, %ymm7
10795 ; AVX-NEXT: vorps %ymm5, %ymm7, %ymm5
10796 ; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10797 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10798 ; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
10799 ; AVX-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
10800 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
10801 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,3,3,4,5,6,7]
10802 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10803 ; AVX-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
10804 ; AVX-NEXT: # xmm7 = xmm7[0,1,2,3,4,5],mem[6],xmm7[7]
10805 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,0,0,4,5,6,7]
10806 ; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,6,6,7]
10807 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3,4,5,6,7]
10808 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10809 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
10810 ; AVX-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7]
10811 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10812 ; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm7[2,1,2,3,4,5,6,7]
10813 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
10814 ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm5[0,1,2,3,4,5],xmm14[6,7]
10815 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
10816 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
10817 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10818 ; AVX-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
10819 ; AVX-NEXT: # xmm5 = mem[0],xmm5[1],mem[2,3,4,5,6,7]
10820 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,0,4,5,6,7]
10821 ; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
10822 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3,4,5,6,7]
10823 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10824 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
10825 ; AVX-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
10826 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
10827 ; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,7]
10828 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload
10829 ; AVX-NEXT: # xmm15 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
10830 ; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,2,3,4,5,6,7,8,9,4,5,8,9,2,3]
10831 ; AVX-NEXT: vpshufb %xmm5, %xmm15, %xmm15
10832 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10833 ; AVX-NEXT: vpsrlq $48, %xmm7, %xmm12
10834 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm15, %ymm12
10835 ; AVX-NEXT: vandps %ymm2, %ymm11, %ymm2
10836 ; AVX-NEXT: vandnps %ymm12, %ymm11, %ymm12
10837 ; AVX-NEXT: vorps %ymm2, %ymm12, %ymm2
10838 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm12
10839 ; AVX-NEXT: vandnps %ymm12, %ymm9, %ymm12
10840 ; AVX-NEXT: vandps %ymm2, %ymm9, %ymm2
10841 ; AVX-NEXT: vorps %ymm2, %ymm12, %ymm2
10842 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10843 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10844 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10845 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
10846 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
10847 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,3,3,4,5,6,7]
10848 ; AVX-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload
10849 ; AVX-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm12 # 16-byte Folded Reload
10850 ; AVX-NEXT: # xmm12 = xmm11[0,1,2,3,4,5],mem[6],xmm11[7]
10851 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7]
10852 ; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7]
10853 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm2[1,2],xmm12[3,4,5,6,7]
10854 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10855 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
10856 ; AVX-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
10857 ; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[2,1,2,3,4,5,6,7]
10858 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
10859 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7]
10860 ; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
10861 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2]
10862 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10863 ; AVX-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm14 # 16-byte Folded Reload
10864 ; AVX-NEXT: # xmm14 = mem[0],xmm11[1],mem[2,3,4,5,6,7]
10865 ; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7]
10866 ; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7]
10867 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm14[2,3,4,5,6,7]
10868 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10869 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
10870 ; AVX-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
10871 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
10872 ; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7]
10873 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
10874 ; AVX-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
10875 ; AVX-NEXT: vpshufb %xmm5, %xmm14, %xmm14
10876 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10877 ; AVX-NEXT: vpsrlq $48, %xmm15, %xmm15
10878 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
10879 ; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
10880 ; AVX-NEXT: vandps %ymm4, %ymm15, %ymm4
10881 ; AVX-NEXT: vandnps %ymm14, %ymm15, %ymm14
10882 ; AVX-NEXT: vorps %ymm4, %ymm14, %ymm4
10883 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
10884 ; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
10885 ; AVX-NEXT: vandnps %ymm12, %ymm14, %ymm12
10886 ; AVX-NEXT: vandps %ymm4, %ymm14, %ymm4
10887 ; AVX-NEXT: vorps %ymm4, %ymm12, %ymm4
10888 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10889 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
10890 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
10891 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,3,3,4,5,6,7]
10892 ; AVX-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm12 # 16-byte Folded Reload
10893 ; AVX-NEXT: # xmm12 = xmm13[0,1,2,3,4,5],mem[6],xmm13[7]
10894 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7]
10895 ; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7]
10896 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm4[1,2],xmm12[3,4,5,6,7]
10897 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10898 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
10899 ; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[2,1,2,3,4,5,6,7]
10900 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
10901 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7]
10902 ; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
10903 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
10904 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10905 ; AVX-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
10906 ; AVX-NEXT: # xmm14 = mem[0],xmm0[1],mem[2,3,4,5,6,7]
10907 ; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7]
10908 ; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7]
10909 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,3,4,5,6,7]
10910 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
10911 ; AVX-NEXT: # xmm14 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
10912 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
10913 ; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7]
10914 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7]
10915 ; AVX-NEXT: vpshufb %xmm5, %xmm14, %xmm14
10916 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10917 ; AVX-NEXT: vpsrlq $48, %xmm0, %xmm15
10918 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
10919 ; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,0,0,65535,65535,65535,65535,65535,65535,65535]
10920 ; AVX-NEXT: vandps %ymm1, %ymm3, %ymm3
10921 ; AVX-NEXT: vandnps %ymm14, %ymm1, %ymm14
10922 ; AVX-NEXT: vorps %ymm3, %ymm14, %ymm3
10923 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
10924 ; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
10925 ; AVX-NEXT: vandnps %ymm12, %ymm13, %ymm12
10926 ; AVX-NEXT: vandps %ymm3, %ymm13, %ymm3
10927 ; AVX-NEXT: vorps %ymm3, %ymm12, %ymm0
10928 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10929 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10930 ; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
10931 ; AVX-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
10932 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
10933 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,3,3,4,5,6,7]
10934 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10935 ; AVX-NEXT: vpblendw $191, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
10936 ; AVX-NEXT: # xmm12 = mem[0,1,2,3,4,5],xmm0[6],mem[7]
10937 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7]
10938 ; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,6,7]
10939 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm3[1,2],xmm12[3,4,5,6,7]
10940 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10941 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
10942 ; AVX-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10943 ; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,1,2,3,4,5,6,7]
10944 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,0,0,0]
10945 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7]
10946 ; AVX-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
10947 ; AVX-NEXT: # xmm6 = mem[0,1,2,3,6,5,6,7]
10948 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,2]
10949 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10950 ; AVX-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
10951 ; AVX-NEXT: # xmm14 = xmm0[0],mem[1],xmm0[2,3,4,5,6,7]
10952 ; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,1,0,4,5,6,7]
10953 ; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7]
10954 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm14[2,3,4,5,6,7]
10955 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10956 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
10957 ; AVX-NEXT: # xmm14 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10958 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
10959 ; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,7]
10960 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
10961 ; AVX-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
10962 ; AVX-NEXT: vpshufb %xmm5, %xmm14, %xmm5
10963 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10964 ; AVX-NEXT: vpsrlq $48, %xmm0, %xmm14
10965 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm5, %ymm5
10966 ; AVX-NEXT: vandps %ymm1, %ymm6, %ymm6
10967 ; AVX-NEXT: vandnps %ymm5, %ymm1, %ymm1
10968 ; AVX-NEXT: vorps %ymm1, %ymm6, %ymm1
10969 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm5
10970 ; AVX-NEXT: vandnps %ymm5, %ymm13, %ymm5
10971 ; AVX-NEXT: vandps %ymm1, %ymm13, %ymm1
10972 ; AVX-NEXT: vorps %ymm5, %ymm1, %ymm0
10973 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10974 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[1,1,1,1]
10975 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5,6,7]
10976 ; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
10977 ; AVX-NEXT: # xmm5 = mem[0,3,2,3]
10978 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
10979 ; AVX-NEXT: vpunpckldq (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload
10980 ; AVX-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1]
10981 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2],xmm5[3,4,5,6,7]
10982 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
10983 ; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
10984 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5],xmm2[6,7]
10985 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10986 ; AVX-NEXT: vpsrlq $48, %xmm0, %xmm1
10987 ; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
10988 ; AVX-NEXT: # xmm5 = mem[2,3,2,3]
10989 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
10990 ; AVX-NEXT: vpsrlq $16, %xmm11, %xmm5
10991 ; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
10992 ; AVX-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
10993 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3],xmm1[4,5,6,7]
10994 ; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10995 ; AVX-NEXT: # xmm1 = mem[0,1,0,3]
10996 ; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
10997 ; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
10998 ; AVX-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
10999 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload
11000 ; AVX-NEXT: # xmm6 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
11001 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,14,15,4,5,6,7,0,1,4,5,8,9,6,7]
11002 ; AVX-NEXT: vpshufb %xmm1, %xmm6, %xmm6
11003 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
11004 ; AVX-NEXT: # xmm12 = mem[2,3,2,3]
11005 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6
11006 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7]
11007 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
11008 ; AVX-NEXT: vandnps %ymm2, %ymm13, %ymm2
11009 ; AVX-NEXT: vandps %ymm5, %ymm13, %ymm5
11010 ; AVX-NEXT: vorps %ymm2, %ymm5, %ymm0
11011 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11012 ; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
11013 ; AVX-NEXT: # xmm2 = mem[1,1,1,1]
11014 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11015 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm10[2,3],xmm2[4,5,6,7]
11016 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
11017 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[0,3,2,3]
11018 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
11019 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
11020 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1]
11021 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3,4,5,6,7]
11022 ; AVX-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
11023 ; AVX-NEXT: # xmm5 = mem[0,1,2,1]
11024 ; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
11025 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm5[6,7]
11026 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX-NEXT: vpsrlq $48, %xmm5, %xmm5
; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
; AVX-NEXT: # xmm6 = mem[2,3,2,3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX-NEXT: vpsrlq $16, %xmm6, %xmm6
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm6[2],xmm15[2],xmm6[3],xmm15[3]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
; AVX-NEXT: vpshufb %xmm1, %xmm6, %xmm6
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[2,3,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
; AVX-NEXT: vandnps %ymm2, %ymm7, %ymm2
; AVX-NEXT: vandps %ymm7, %ymm5, %ymm5
; AVX-NEXT: vorps %ymm2, %ymm5, %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX-NEXT: # xmm2 = mem[1,1,1,1]
; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = mem[0,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpsrlq $48, %xmm0, %xmm4
; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = mem[2,3,2,3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpsrlq $16, %xmm0, %xmm5
; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7]
; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = mem[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm5
; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
; AVX-NEXT: # xmm6 = mem[2,3,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX-NEXT: vandnps %ymm2, %ymm7, %ymm2
; AVX-NEXT: vandps %ymm7, %ymm4, %ymm4
; AVX-NEXT: vorps %ymm2, %ymm4, %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX-NEXT: # xmm2 = mem[1,1,1,1]
; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vpsrlq $48, %xmm3, %xmm3
; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = mem[2,3,2,3]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX-NEXT: vpsrlq $16, %xmm4, %xmm4
; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7]
; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = mem[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm1
; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = mem[2,3,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX-NEXT: vandnps %ymm2, %ymm7, %ymm2
; AVX-NEXT: vandps %ymm7, %ymm1, %ymm1
; AVX-NEXT: vorps %ymm2, %ymm1, %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpsrlq $48, %xmm10, %xmm1
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3]
; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,8,9,8,9,8,9,6,7,6,7,6,7,6,7]
; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX-NEXT: # xmm2 = xmm2[0,1,2,3,4,5],mem[6],xmm2[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
; AVX-NEXT: vpsrldq {{.*#+}} xmm4 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vandnps %ymm1, %ymm7, %ymm1
; AVX-NEXT: vandps %ymm7, %ymm2, %ymm2
; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpsrlq $48, %xmm1, %xmm1
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX-NEXT: # xmm2 = xmm2[0,1,2,3,4,5],mem[6],xmm2[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
; AVX-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,2,2]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
; AVX-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,7]
; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
; AVX-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,2]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vandnps %ymm1, %ymm7, %ymm1
; AVX-NEXT: vandps %ymm7, %ymm2, %ymm2
; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpsrlq $48, %xmm1, %xmm2
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
; AVX-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpblendw $64, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
; AVX-NEXT: # xmm3 = xmm1[0,1,2,3,4,5],mem[6],xmm1[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,7,6]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpsrld $16, %xmm1, %xmm3
; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
; AVX-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,2]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = mem[0],xmm1[1],mem[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,4,7,7]
; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,2]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX-NEXT: vandnps %ymm2, %ymm7, %ymm2
; AVX-NEXT: vandps %ymm7, %ymm3, %ymm3
; AVX-NEXT: vorps %ymm2, %ymm3, %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3]
; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm3
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX-NEXT: vpsrlq $48, %xmm14, %xmm4
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3,4,5],xmm2[6],xmm8[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,7,6]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpsrld $16, %xmm0, %xmm4
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,2]
; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0],xmm13[1],xmm10[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,4,7,7]
; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,2]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX-NEXT: vpsrldq {{.*#+}} xmm6 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX-NEXT: vandnps %ymm3, %ymm7, %ymm3
; AVX-NEXT: vandps %ymm7, %ymm4, %ymm4
; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3
; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps (%rsp), %xmm4 # 16-byte Reload
; AVX-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = zero,xmm4[1],mem[0],zero
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = mem[0],xmm3[1],mem[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = mem[1,1,1,1]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vpsrldq {{.*#+}} xmm6 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload
; AVX-NEXT: # xmm6 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
; AVX-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
; AVX-NEXT: # xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
; AVX-NEXT: # xmm7 = mem[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm7[6,7]
; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
; AVX-NEXT: # xmm7 = mem[3,3,3,3]
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0]
; AVX-NEXT: vandnps %ymm4, %ymm3, %ymm4
; AVX-NEXT: vandps %ymm3, %ymm5, %ymm5
; AVX-NEXT: vorps %ymm4, %ymm5, %ymm4
; AVX-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = zero,xmm15[1],mem[0],zero
; AVX-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm11[2],xmm14[2],xmm11[3],xmm14[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm8[1],xmm2[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm6[5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1]
; AVX-NEXT: vpsrldq {{.*#+}} xmm7 = xmm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7]
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm13[0],xmm7[0],xmm13[1],xmm7[1],xmm13[2],xmm7[2],xmm13[3],xmm7[3]
; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
; AVX-NEXT: # xmm8 = mem[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[3,3,3,3]
; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7
; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
; AVX-NEXT: vandnps %ymm5, %ymm3, %ymm5
; AVX-NEXT: vandps %ymm3, %ymm6, %ymm6
; AVX-NEXT: vorps %ymm5, %ymm6, %ymm5
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
; AVX-NEXT: # xmm6 = zero,xmm0[1],mem[0],zero
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
; AVX-NEXT: # xmm7 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
; AVX-NEXT: # xmm7 = mem[0],xmm0[1],mem[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
; AVX-NEXT: # xmm7 = mem[1,1,1,1]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpsrldq {{.*#+}} xmm8 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
; AVX-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7]
; AVX-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
; AVX-NEXT: # xmm8 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
; AVX-NEXT: # xmm9 = mem[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6,7]
; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
; AVX-NEXT: # xmm9 = mem[3,3,3,3]
; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8
; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX-NEXT: vandnps %ymm6, %ymm3, %ymm6
; AVX-NEXT: vandps %ymm3, %ymm7, %ymm7
; AVX-NEXT: vorps %ymm6, %ymm7, %ymm6
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vinsertps $41, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
; AVX-NEXT: # xmm7 = zero,xmm0[1],mem[0],zero
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
; AVX-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
; AVX-NEXT: # xmm8 = mem[0],xmm0[1],mem[2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm8[5,6,7]
; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
; AVX-NEXT: # xmm8 = mem[1,1,1,1]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpsrldq {{.*#+}} xmm9 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
; AVX-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7]
; AVX-NEXT: vpmovzxwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
; AVX-NEXT: # xmm9 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
; AVX-NEXT: # xmm10 = mem[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm10[6,7]
; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
; AVX-NEXT: # xmm10 = mem[3,3,3,3]
; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9
; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
; AVX-NEXT: vandnps %ymm7, %ymm3, %ymm7
; AVX-NEXT: vandps %ymm3, %ymm8, %ymm0
; AVX-NEXT: vorps %ymm7, %ymm0, %ymm0
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 96(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 32(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm7, 64(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm7, (%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 96(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 32(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 64(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, (%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 96(%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 64(%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, (%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 96(%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 32(%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 64(%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, (%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 96(%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 32(%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, (%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 64(%r9)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 96(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 32(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 64(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, (%rax)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovaps %ymm0, 32(%rax)
; AVX-NEXT: vmovaps %ymm6, (%rax)
; AVX-NEXT: vmovaps %ymm5, 96(%rax)
; AVX-NEXT: vmovaps %ymm4, 64(%rax)
; AVX-NEXT: addq $1544, %rsp # imm = 0x608
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: load_i16_stride7_vf64:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $1448, %rsp # imm = 0x5A8
; AVX2-NEXT: vmovdqa 256(%rdi), %ymm13
; AVX2-NEXT: vmovdqa 224(%rdi), %ymm14
; AVX2-NEXT: vmovdqa 512(%rdi), %ymm15
; AVX2-NEXT: vmovdqa 544(%rdi), %ymm12
; AVX2-NEXT: vmovdqa 480(%rdi), %ymm9
; AVX2-NEXT: vmovdqa 448(%rdi), %ymm10
; AVX2-NEXT: vmovdqa (%rdi), %ymm3
; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4
; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm4
; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,0]
; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm9[2],ymm10[3,4,5],ymm9[6],ymm10[7]
; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm15[1],ymm12[2,3,4],ymm15[5],ymm12[6,7]
; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm5
; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm0
; AVX2-NEXT: vmovdqa %ymm1, %ymm8
; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7]
; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-NEXT: vmovdqa 288(%rdi), %ymm7
; AVX2-NEXT: vmovdqa 320(%rdi), %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7]
; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm5
; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 704(%rdi), %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 672(%rdi), %ymm8
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm0[2],ymm8[3,4,5],ymm0[6],ymm8[7]
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vmovdqa 736(%rdi), %ymm11
; AVX2-NEXT: vmovdqa 768(%rdi), %ymm5
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm11[1],ymm5[2,3,4],ymm11[5],ymm5[6,7]
; AVX2-NEXT: vmovdqa %ymm5, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,0,0,0]
; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7,8,9,10],ymm3[11],ymm2[12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7]
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21]
; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm5
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vmovdqa %ymm6, %ymm9
; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm7[2],ymm1[3,4],ymm7[5],ymm1[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7]
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm4
; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX2-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm4
; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa %ymm11, %ymm15
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7]
; AVX2-NEXT: vmovdqa %ymm0, %ymm14
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7]
; AVX2-NEXT: vmovdqa %ymm8, %ymm13
; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm4
; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX2-NEXT: vmovdqa %ymm9, %ymm7
; AVX2-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm1[2],ymm9[3,4],ymm1[5],ymm9[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5,6,7,8,9,10],ymm5[11],ymm4[12,13,14,15]
; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7]
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm2
; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4],ymm3[5,6,7,8,9,10,11],ymm2[12],ymm3[13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm11[1],ymm8[2,3],ymm11[4],ymm8[5,6,7]
; AVX2-NEXT: vmovdqa %ymm8, %ymm10
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23]
; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm5
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7]
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm9[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15]
; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm5
; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
; AVX2-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7]
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7]
; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm15[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15]
; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm5
; AVX2-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7]
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7]
; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm15[2,3],ymm6[4,5],ymm15[6,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15]
; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vmovdqa %ymm0, %ymm5
; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX2-NEXT: # ymm2 = ymm0[0,1,2],mem[3],ymm0[4,5],mem[6],ymm0[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[1,3,2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm4
; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm4
; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm4
; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
; AVX2-NEXT: # ymm3 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
; AVX2-NEXT: # ymm4 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7]
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 128(%rdi), %ymm0
; AVX2-NEXT: vmovdqa 160(%rdi), %ymm2
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
; AVX2-NEXT: vmovdqa %ymm2, %ymm10
; AVX2-NEXT: vmovdqa %ymm0, %ymm12
; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,4,7]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7]
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vmovdqa 192(%rdi), %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm0[0,1,0,2]
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,1,2,1,4,5,6,5]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4],ymm1[5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 576(%rdi), %ymm13
; AVX2-NEXT: vmovdqa 608(%rdi), %ymm11
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa 640(%rdi), %ymm1
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm1[0,1,0,2]
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[0,1,2,1,4,5,6,5]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
; AVX2-NEXT: vpblendd $31, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 352(%rdi), %ymm5
; AVX2-NEXT: vmovdqa 384(%rdi), %ymm6
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,4,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
; AVX2-NEXT: vmovdqa 416(%rdi), %ymm14
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm14[0,1,0,2]
; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[0,1,2,1,4,5,6,5]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm9[7]
; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4],ymm2[5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 800(%rdi), %ymm7
; AVX2-NEXT: vmovdqa 832(%rdi), %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm7[3],ymm1[4,5],ymm7[6],ymm1[7]
; AVX2-NEXT: vmovdqa %ymm1, %ymm9
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,4,7]
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa 864(%rdi), %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,0,2]
; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,1,2,1,4,5,6,5]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm15[7]
; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm11[2],ymm13[3,4,5],ymm11[6],ymm13[7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm15
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm15[4],xmm0[5],xmm15[6],xmm0[7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
; AVX2-NEXT: vpshufb %xmm15, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,5,7]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7]
; AVX2-NEXT: vpshufb %xmm15, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,5,7]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm9[2],ymm7[3,4,5],ymm9[6],ymm7[7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7]
; AVX2-NEXT: vpshufb %xmm15, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; AVX2-NEXT: vpshufb %xmm15, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[0,1,1,3,4,5,5,7]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm8[0,1,1,2]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa %ymm11, %ymm6
; AVX2-NEXT: vmovdqa %ymm13, %ymm4
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm11[3],ymm13[4,5],ymm11[6],ymm13[7]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7]
; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm13[0,1,1,2]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm9[3],ymm7[4,5],ymm9[6],ymm7[7]
; AVX2-NEXT: vmovdqa %ymm7, %ymm5
; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7]
; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,1,2]
; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,0,3,4,5,4,7]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm11[3],ymm7[4,5],ymm11[6],ymm7[7]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7]
; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,1,1,2]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5,6,7]
; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm8[0,1,1,3]
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7]
; AVX2-NEXT: vmovdqa %ymm4, %ymm8
; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm13[0,1,1,3]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[0,1,2,0,4,5,6,4]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11955 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
11956 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11957 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11958 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6,7]
11959 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
11960 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
11961 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
11962 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
11963 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
11964 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm14[0,1,1,3]
11965 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,0,4,5,6,4]
11966 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
11967 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
11968 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11969 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
11970 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
11971 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11972 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm5[1],ymm9[2,3],ymm5[4],ymm9[5,6,7]
11973 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
11974 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7]
11975 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
11976 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
11977 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
11978 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,1,1,3]
11979 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,2,0,4,5,6,4]
11980 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
11981 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
11982 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11983 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15]
11984 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
11985 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11986 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11987 ; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11988 ; AVX2-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7]
11989 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
11990 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
11991 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
11992 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
11993 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
11994 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
11995 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
11996 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7]
11997 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
11998 ; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm5
11999 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1]
12000 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7,8,9,10,11,12,13],ymm4[14],ymm5[15]
12001 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3]
12002 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7]
12003 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6
12004 ; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm3[1],xmm6[2],xmm3[3],xmm6[4,5,6,7]
12005 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
12006 ; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6
12007 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
12008 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,1,2,1,4,5,6,5]
12009 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
12010 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
12011 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3,4,5,6,7],ymm5[8],ymm6[9,10,11,12,13,14,15]
12012 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
12013 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12014 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7]
12015 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
12016 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7]
12017 ; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
12018 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
12019 ; AVX2-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
12020 ; AVX2-NEXT: # ymm5 = mem[0,1,2,1,4,5,6,5]
12021 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
12022 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
12023 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12024 ; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
12025 ; AVX2-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7]
12026 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
12027 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
12028 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
12029 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
12030 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
12031 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12032 ; AVX2-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
12033 ; AVX2-NEXT: # ymm6 = mem[0,1],ymm6[2],mem[3,4,5],ymm6[6],mem[7]
12034 ; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm7
12035 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4,5,6,7],ymm7[8],ymm4[9,10,11,12,13,14,15]
12036 ; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
12037 ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15]
12038 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
12039 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
12040 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12041 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
12042 ; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload
12043 ; AVX2-NEXT: # ymm4 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7]
12044 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
12045 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
12046 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
12047 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
12048 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
12049 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
12050 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
12051 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7]
12052 ; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm6
12053 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
12054 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7,8,9,10,11,12,13],ymm5[14],ymm6[15]
12055 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
12056 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12057 ; AVX2-NEXT: vmovdqa %ymm11, %ymm8
12058 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7]
12059 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7
12060 ; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7]
12061 ; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5
12062 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
12063 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,1,4,5,6,5]
12064 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
12065 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
12066 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15]
12067 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
12068 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12069 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12070 ; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
12071 ; AVX2-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
12072 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
12073 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
12074 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
12075 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
12076 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
12077 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
12078 ; AVX2-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload
12079 ; AVX2-NEXT: # ymm4 = mem[0,1],ymm11[2],mem[3,4,5],ymm11[6],mem[7]
12080 ; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm2
12081 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,0,1]
12082 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6],ymm2[7,8,9,10,11,12,13],ymm4[14],ymm2[15]
12083 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
12084 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
12085 ; AVX2-NEXT: vpblendd $221, (%rsp), %ymm7, %ymm4 # 32-byte Folded Reload
12086 ; AVX2-NEXT: # ymm4 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7]
12087 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
12088 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7]
12089 ; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3
12090 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
12091 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,1,4,5,6,5]
12092 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
12093 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
12094 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
12095 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12096 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12097 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7]
12098 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
12099 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
12100 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12101 ; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
12102 ; AVX2-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
12103 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
12104 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7]
12105 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
12106 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
12107 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
12108 ; AVX2-NEXT: vmovdqa 656(%rdi), %xmm13
12109 ; AVX2-NEXT: vmovdqa 640(%rdi), %xmm14
12110 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,5],xmm13[6],xmm14[7]
12111 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
12112 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6]
12113 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
12114 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
12115 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12116 ; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
12117 ; AVX2-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7]
12118 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
12119 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
12120 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
12121 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
12122 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
12123 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
12124 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
12125 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
12126 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
12127 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
12128 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12129 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12130 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7]
12131 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
12132 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
12133 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
12134 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
12135 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3],xmm1[4],xmm4[5,6,7]
12136 ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
12137 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
12138 ; AVX2-NEXT: vmovdqa 432(%rdi), %xmm4
12139 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12140 ; AVX2-NEXT: vmovdqa 416(%rdi), %xmm5
12141 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12142 ; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm4[6],xmm5[7]
12143 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
12144 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,6]
12145 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
12146 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7],ymm1[8,9,10,11,12],ymm5[13,14,15]
12147 ; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
12148 ; AVX2-NEXT: # ymm5 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7]
12149 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
12150 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
12151 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
12152 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
12153 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
12154 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
12155 ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
12156 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
12157 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
12158 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12159 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12160 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12161 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7]
12162 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
12163 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
12164 ; AVX2-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload
12165 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
12166 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
12167 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
12168 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
12169 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6
12170 ; AVX2-NEXT: vmovdqa 880(%rdi), %xmm1
12171 ; AVX2-NEXT: vmovdqa 864(%rdi), %xmm0
12172 ; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
12173 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
12174 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6]
12175 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
12176 ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15]
12177 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12178 ; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload
12179 ; AVX2-NEXT: # ymm7 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7]
12180 ; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm15
12181 ; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3]
12182 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,3,4,5,6,7]
12183 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
12184 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
12185 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3]
12186 ; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm5
12187 ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0],ymm6[1,2,3,4,5,6,7],ymm5[8],ymm6[9,10,11,12,13,14,15]
12188 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
12189 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
12190 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12191 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12192 ; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
12193 ; AVX2-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7]
12194 ; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
12195 ; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3],xmm5[4],xmm6[5,6,7]
12196 ; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm3
12197 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5
12198 ; AVX2-NEXT: vmovdqa 208(%rdi), %xmm15
12199 ; AVX2-NEXT: vmovdqa 192(%rdi), %xmm3
12200 ; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5],xmm15[6],xmm3[7]
12201 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
12202 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6]
12203 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
12204 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
12205 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12206 ; AVX2-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
12207 ; AVX2-NEXT: # ymm6 = ymm6[0,1,2],mem[3],ymm6[4,5],mem[6],ymm6[7]
12208 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
12209 ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7,8],ymm6[9,10,11,12,13,14],ymm7[15]
12210 ; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm2
12211 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
12212 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
12213 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1],ymm11[2,3],ymm12[4,5],ymm11[6,7]
12214 ; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
12215 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
12216 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7]
12217 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
12218 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
12219 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
12220 ; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm5[1,2,3,4,5,6,7],ymm2[8],ymm5[9,10,11,12,13,14,15]
12221 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3]
12222 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
12223 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12224 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12225 ; AVX2-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
12226 ; AVX2-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6,7]
12227 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm2[2,3,0,1]
12228 ; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm5[1],ymm2[2,3,4,5,6,7,8],ymm5[9],ymm2[10,11,12,13,14,15]
12229 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12230 ; AVX2-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
12231 ; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7]
12232 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5
12233 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4],xmm2[5],xmm5[6,7]
12234 ; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6,7]
12235 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
12236 ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
12237 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
12238 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
12239 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
12240 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
12241 ; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1,2,3,4],ymm7[5,6,7],ymm2[8,9,10,11,12],ymm7[13,14,15]
12242 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12243 ; AVX2-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
12244 ; AVX2-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5],mem[6],ymm2[7]
12245 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,6,4,6,7]
12246 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
12247 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
12248 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7]
12249 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
12250 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
12251 ; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm6
12252 ; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2,3,4,5,6,7],ymm6[8],ymm7[9,10,11,12,13,14,15]
12253 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3]
12254 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
12255 ; AVX2-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload
12256 ; AVX2-NEXT: # ymm7 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5,6,7]
12257 ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm7[2,3,0,1]
12258 ; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm10[1],ymm7[2,3,4,5,6,7,8],ymm10[9],ymm7[10,11,12,13,14,15]
12259 ; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload
12260 ; AVX2-NEXT: # ymm9 = ymm8[0,1],mem[2,3],ymm8[4,5],mem[6,7]
12261 ; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm10
12262 ; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4],xmm9[5],xmm10[6,7]
12263 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
12264 ; AVX2-NEXT: vpshufb %xmm5, %xmm9, %xmm1
12265 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
12266 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
12267 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7]
12268 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12269 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
12270 ; AVX2-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload
12271 ; AVX2-NEXT: # ymm1 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7]
12272 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm1[0,1,2,3,6,4,6,7]
12273 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
12274 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
12275 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,7]
12276 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
12277 ; AVX2-NEXT: vpshufb %ymm2, %ymm7, %ymm7
12278 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1,2,3,4,5,6,7],ymm7[8],ymm0[9,10,11,12,13,14,15]
12279 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
12280 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12281 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12282 ; AVX2-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
12283 ; AVX2-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6,7]
12284 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1]
12285 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4,5,6,7,8],ymm7[9],ymm1[10,11,12,13,14,15]
12286 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12287 ; AVX2-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload
12288 ; AVX2-NEXT: # ymm7 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7]
12289 ; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm9
12290 ; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4],xmm7[5],xmm9[6,7]
12291 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
12292 ; AVX2-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
12293 ; AVX2-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7]
12294 ; AVX2-NEXT: vpshufb %xmm5, %xmm7, %xmm7
12295 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
12296 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
12297 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
12298 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
12299 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7],ymm7[8,9,10,11,12],ymm4[13,14,15]
12300 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
12301 ; AVX2-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
12302 ; AVX2-NEXT: # ymm7 = mem[0,1,2],ymm7[3],mem[4,5],ymm7[6],mem[7]
12303 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm7[0,1,2,3,6,4,6,7]
12304 ; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm7
12305 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
12306 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7]
12307 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
12308 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
12309 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm4[1,2,3,4,5,6,7],ymm1[8],ymm4[9,10,11,12,13,14,15]
12310 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
12311 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
12312 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12313 ; AVX2-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
12314 ; AVX2-NEXT: # ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7]
12315 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7
12316 ; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4],xmm4[5],xmm7[6,7]
12317 ; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
12318 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3,4,5,6,7]
12319 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
12320 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
12321 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
12322 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
12323 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7],ymm4[8,9,10,11,12],ymm3[13,14,15]
12324 ; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12325 ; AVX2-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
12326 ; AVX2-NEXT: # ymm4 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6,7]
12327 ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
12328 ; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6,7,8],ymm5[9],ymm4[10,11,12,13,14,15]
12329 ; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm2
12330 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
12331 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,6,4,6,7]
12332 ; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4
12333 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
12334 ; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
12335 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
12336 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15]
12337 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
12338 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
12339 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12340 ; AVX2-NEXT: vmovaps %ymm3, 96(%rsi)
12341 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12342 ; AVX2-NEXT: vmovaps %ymm3, 32(%rsi)
12343 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12344 ; AVX2-NEXT: vmovaps %ymm3, 64(%rsi)
12345 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12346 ; AVX2-NEXT: vmovaps %ymm3, (%rsi)
12347 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12348 ; AVX2-NEXT: vmovaps %ymm3, 96(%rdx)
12349 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12350 ; AVX2-NEXT: vmovaps %ymm3, 32(%rdx)
12351 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12352 ; AVX2-NEXT: vmovaps %ymm3, 64(%rdx)
12353 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12354 ; AVX2-NEXT: vmovaps %ymm3, (%rdx)
12355 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12356 ; AVX2-NEXT: vmovaps %ymm3, 32(%rcx)
12357 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12358 ; AVX2-NEXT: vmovaps %ymm3, 96(%rcx)
12359 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12360 ; AVX2-NEXT: vmovaps %ymm3, 64(%rcx)
12361 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12362 ; AVX2-NEXT: vmovaps %ymm3, (%rcx)
12363 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12364 ; AVX2-NEXT: vmovaps %ymm3, 96(%r8)
12365 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12366 ; AVX2-NEXT: vmovaps %ymm3, 32(%r8)
12367 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12368 ; AVX2-NEXT: vmovaps %ymm3, 64(%r8)
12369 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12370 ; AVX2-NEXT: vmovaps %ymm3, (%r8)
12371 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12372 ; AVX2-NEXT: vmovaps %ymm3, 96(%r9)
12373 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12374 ; AVX2-NEXT: vmovaps %ymm3, 32(%r9)
12375 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12376 ; AVX2-NEXT: vmovaps %ymm3, (%r9)
12377 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12378 ; AVX2-NEXT: vmovaps %ymm3, 64(%r9)
12379 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
12380 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12381 ; AVX2-NEXT: vmovaps %ymm3, 96(%rax)
12382 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12383 ; AVX2-NEXT: vmovaps %ymm3, 32(%rax)
12384 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12385 ; AVX2-NEXT: vmovaps %ymm3, 64(%rax)
12386 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12387 ; AVX2-NEXT: vmovaps %ymm3, (%rax)
12388 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
12389 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rax)
12390 ; AVX2-NEXT: vmovdqa %ymm2, (%rax)
12391 ; AVX2-NEXT: vmovdqa %ymm0, 96(%rax)
12392 ; AVX2-NEXT: vmovdqa %ymm6, 64(%rax)
12393 ; AVX2-NEXT: addq $1448, %rsp # imm = 0x5A8
12394 ; AVX2-NEXT: vzeroupper
12395 ; AVX2-NEXT: retq
12396 ;
12397 ; AVX2-FP-LABEL: load_i16_stride7_vf64:
12398 ; AVX2-FP: # %bb.0:
12399 ; AVX2-FP-NEXT: subq $1448, %rsp # imm = 0x5A8
12400 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm13
12401 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm15
12402 ; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm11
12403 ; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm14
12404 ; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm9
12405 ; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm12
12406 ; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3
12407 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12408 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm4
12409 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12410 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm1
12411 ; AVX2-FP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
12412 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm0
12413 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12414 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
12415 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
12416 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19]
12417 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
12418 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
12419 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm3
12420 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6],xmm1[7]
12421 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
12422 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm4
12423 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,0]
12424 ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
12425 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12426 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm9[2],ymm12[3,4,5],ymm9[6],ymm12[7]
12427 ; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12428 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm4
12429 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7]
12430 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
12431 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7]
12432 ; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12433 ; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12434 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
12435 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm4, %ymm4
12436 ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0
12437 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12438 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm13[2],ymm15[3,4,5],ymm13[6],ymm15[7]
12439 ; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12440 ; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12441 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm4
12442 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7]
12443 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
12444 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm8
12445 ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm7
12446 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7]
12447 ; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12448 ; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12449 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
12450 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm4, %ymm4
12451 ; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0
12452 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12453 ; AVX2-FP-NEXT: vmovdqa 704(%rdi), %ymm0
12454 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12455 ; AVX2-FP-NEXT: vmovdqa 672(%rdi), %ymm10
12456 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2],ymm10[3,4,5],ymm0[6],ymm10[7]
12457 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm4
12458 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7]
12459 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
12460 ; AVX2-FP-NEXT: vmovdqa 736(%rdi), %ymm6
12461 ; AVX2-FP-NEXT: vmovdqa 768(%rdi), %ymm1
12462 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7]
12463 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12464 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
12465 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
12466 ; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,0,0,0]
12467 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0
12468 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12469 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7]
12470 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
12471 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7,8,9,10],ymm2[11],ymm0[12,13,14,15]
12472 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm9[3],ymm12[4,5],ymm9[6],ymm12[7]
12473 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
12474 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7]
12475 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,8,9,6,7,4,5,18,19,18,19,18,19,18,19,24,25,24,25,22,23,20,21]
12476 ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
12477 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
12478 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm4, %xmm4
12479 ; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm0, %ymm0
12480 ; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm9
12481 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12482 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7]
12483 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
12484 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15]
12485 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7]
12486 ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5
12487 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
12488 ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
12489 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm4, %xmm4
12490 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0
12491 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12492 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7]
12493 ; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm8
12494 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
12495 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15]
12496 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
12497 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7]
12498 ; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm14
12499 ; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12500 ; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5
12501 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
12502 ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
12503 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm4, %xmm4
12504 ; AVX2-FP-NEXT: vmovdqa %ymm9, %ymm6
12505 ; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm0
12506 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12507 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12508 ; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload
12509 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
12510 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
12511 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7,8,9,10],ymm4[11],ymm0[12,13,14,15]
12512 ; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
12513 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
12514 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12515 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7]
12516 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
12517 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7]
12518 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm2
12519 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
12520 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12521 ; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm11
12522 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1]
12523 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7]
12524 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15]
12525 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7]
12526 ; AVX2-FP-NEXT: vmovdqa %ymm9, %ymm10
12527 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
12528 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
12529 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,4,5,4,5,4,5,8,9,10,11,8,9,6,7,20,21,20,21,20,21,20,21,24,25,26,27,24,25,22,23]
12530 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
12531 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
12532 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm4
12533 ; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm0, %ymm0
12534 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12535 ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12536 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12537 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2,3],ymm12[4],ymm6[5,6,7]
12538 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm4
12539 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7]
12540 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
12541 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12542 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm15[2,3],ymm1[4,5],ymm15[6,7]
12543 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm15[2,3,0,1]
12544 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15]
12545 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
12546 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm4, %ymm4
12547 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0
12548 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12549 ; AVX2-FP-NEXT: vmovdqa %ymm13, %ymm7
12550 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7]
12551 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm4
12552 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7]
12553 ; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12554 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12555 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
12556 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm8[2,3,0,1]
12557 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7,8,9,10,11],ymm5[12],ymm4[13,14,15]
12558 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
12559 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm4, %ymm4
12560 ; AVX2-FP-NEXT: vmovdqa %ymm11, %ymm8
12561 ; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0
12562 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12563 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
12564 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
12565 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6,7]
12566 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm4
12567 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7]
12568 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
12569 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
12570 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12571 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm14[2,3],ymm5[4,5],ymm14[6,7]
12572 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm14[2,3,0,1]
12573 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7,8,9,10,11],ymm4[12],ymm3[13,14,15]
12574 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
12575 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0
12576 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12577 ; AVX2-FP-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload
12578 ; AVX2-FP-NEXT: # ymm0 = mem[0],ymm10[1],mem[2,3,4],ymm10[5],mem[6,7]
12579 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2
12580 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
12581 ; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12582 ; AVX2-FP-NEXT: vpblendd $72, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
12583 ; AVX2-FP-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5],mem[6],ymm2[7]
12584 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[1,3,2,3]
12585 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
12586 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm3, %ymm4
12587 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15]
12588 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
12589 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0
12590 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12591 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7]
12592 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm4
12593 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7]
12594 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
12595 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm15[3],ymm1[4,5],ymm15[6],ymm1[7]
12596 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
12597 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm4, %ymm4
12598 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0
12599 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12600 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm13[1],ymm11[2,3,4],ymm13[5],ymm11[6,7]
12601 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm4
12602 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7]
12603 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
12604 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7]
12605 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
12606 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm4, %ymm4
12607 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0
12608 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12609 ; AVX2-FP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload
12610 ; AVX2-FP-NEXT: # ymm0 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7]
12611 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm4
12612 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7]
12613 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
12614 ; AVX2-FP-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
12615 ; AVX2-FP-NEXT: # ymm3 = ymm9[0,1,2],mem[3],ymm9[4,5],mem[6],ymm9[7]
12616 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
12617 ; AVX2-FP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
12618 ; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0
12619 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12620 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm9
12621 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm0
12622 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12623 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7]
12624 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
12625 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,6,7,6,7]
12626 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
12627 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
12628 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
12629 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12630 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm1
12631 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12632 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,0,2]
12633 ; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
12634 ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm15, %ymm1
12635 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
12636 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12637 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
12638 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12639 ; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm8
12640 ; AVX2-FP-NEXT: vmovdqa 608(%rdi), %ymm7
12641 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7]
12642 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
12643 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
12644 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
12645 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
12646 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
12647 ; AVX2-FP-NEXT: vmovdqa 640(%rdi), %ymm1
12648 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12649 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,1,0,2]
12650 ; AVX2-FP-NEXT: vpshufb %ymm4, %ymm5, %ymm1
12651 ; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm11
12652 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
12653 ; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12654 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
12655 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm3
; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm6
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7]
; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12
; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
; AVX2-FP-NEXT: vpshufb %ymm4, %ymm1, %ymm13
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7]
; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm12[5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm4
; AVX2-FP-NEXT: vmovdqa 832(%rdi), %ymm10
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7]
; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm12
; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm13
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm10
; AVX2-FP-NEXT: vpshufb %xmm2, %xmm10, %xmm2
; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovdqa 864(%rdi), %ymm14
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm14[0,1,0,2]
; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpshufb %ymm11, %ymm10, %ymm11
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7]
; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm11
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5],xmm11[6],xmm0[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm0
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm5, %ymm5
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm6[2],ymm3[3,4,5],ymm6[6],ymm3[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm5
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7]
; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm0
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm12[2],ymm4[3,4,5],ymm12[6],ymm4[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm0
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm10, %ymm1
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm0
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm15, %ymm1
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7]
; AVX2-FP-NEXT: vmovdqa %ymm9, %ymm6
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm11[0,1,1,2]
; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31,16,17,30,31]
; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm4[0,1,1,2]
; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,1,1,2]
; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm15[3],ymm9[4,5],ymm15[6],ymm9[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,1,2]
; AVX2-FP-NEXT: vpshufb %ymm5, %ymm1, %ymm1
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm6[1],ymm10[2,3],ymm6[4],ymm10[5,6,7]
; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm14
; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm11[0,1,1,3]
; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
; AVX2-FP-NEXT: vpshufb %ymm3, %ymm5, %ymm5
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm11
; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm5
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7]
; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[0,1,1,3]
; AVX2-FP-NEXT: vpshufb %ymm3, %ymm5, %ymm6
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm9[1],ymm15[2,3],ymm9[4],ymm15[5,6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm6
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3,4,5],xmm6[6],xmm1[7]
; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm6
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,1,3]
; AVX2-FP-NEXT: vpshufb %ymm3, %ymm1, %ymm7
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7]
; AVX2-FP-NEXT: vpshufb %xmm0, %xmm6, %xmm0
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6
; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = mem[0,1,1,3]
; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm2
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3,4,5,6,7],ymm4[8],ymm2[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
; AVX2-FP-NEXT: vmovd {{.*#+}} xmm6 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm4
; AVX2-FP-NEXT: vmovdqa %xmm6, %xmm12
; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm6 = mem[0,1],ymm13[2],mem[3,4,5],ymm13[6],mem[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm6, %ymm7
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm8[1],ymm11[2,3,4],ymm8[5],ymm11[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm8
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2],xmm4[3],xmm8[4,5,6,7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %xmm4, %xmm8, %xmm8
; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
; AVX2-FP-NEXT: vpshufb %ymm9, %ymm5, %ymm5
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5,6],ymm5[7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6,7],ymm7[8],ymm5[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7]
; AVX2-FP-NEXT: vpshufb %xmm12, %xmm5, %xmm6
; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm5
; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4,5],mem[6],ymm6[7]
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm6, %ymm7
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7,8,9,10,11,12,13],ymm6[14],ymm7[15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm14[1],ymm10[2,3,4],ymm14[5],ymm10[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm8
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm4, %xmm6, %xmm6
; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
; AVX2-FP-NEXT: vmovdqa %ymm9, %ymm8
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6],ymm3[7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2,3,4,5,6,7],ymm7[8],ymm3[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7]
; AVX2-FP-NEXT: vpshufb %xmm12, %xmm3, %xmm5
; AVX2-FP-NEXT: vmovdqa %xmm12, %xmm14
; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm3
; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7]
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm5, %ymm6
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7,8,9,10,11,12,13],ymm5[14],ymm6[15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm7
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm5
; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
; AVX2-FP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm1[1,2,3,4,5,6,7],ymm6[8],ymm1[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
; AVX2-FP-NEXT: vpshufb %xmm14, %xmm1, %xmm3
; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = ymm14[0,1],mem[2],ymm14[3,4,5],mem[6],ymm14[7]
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7,8,9,10,11,12,13],ymm3[14],ymm2[15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm5
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX2-FP-NEXT: vpshufb %ymm8, %ymm0, %ymm0
; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = mem[0,1,2],ymm13[3],mem[4,5],ymm13[6],mem[7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4],xmm2[5,6,7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vmovdqa 656(%rdi), %xmm3
; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovdqa 640(%rdi), %xmm2
; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6],xmm2[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13]
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-FP-NEXT: vmovd {{.*#+}} xmm13 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FP-NEXT: vpshufb %xmm13, %xmm4, %xmm8
; AVX2-FP-NEXT: vmovd {{.*#+}} xmm4 = [10,11,6,7,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm8
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm1[2],xmm8[3],xmm1[4],xmm8[5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vmovdqa 432(%rdi), %xmm9
; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovdqa 416(%rdi), %xmm8
; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6],xmm8[7]
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm8, %xmm8
; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15]
; AVX2-FP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm8 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm11
; AVX2-FP-NEXT: vmovdqa %xmm13, %xmm9
; AVX2-FP-NEXT: vpshufb %xmm13, %xmm11, %xmm11
; AVX2-FP-NEXT: vpshufb %xmm4, %xmm8, %xmm8
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3]
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm0
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11
; AVX2-FP-NEXT: vmovdqa 880(%rdi), %xmm1
; AVX2-FP-NEXT: vmovdqa 864(%rdi), %xmm0
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm15, %xmm15
; AVX2-FP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5,6,7],ymm11[8,9,10,11,12],ymm15[13,14,15]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm15 = mem[0,1],ymm6[2,3],mem[4,5],ymm6[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm15, %xmm13
; AVX2-FP-NEXT: vpshufb %xmm9, %xmm13, %xmm13
; AVX2-FP-NEXT: vpshufb %xmm4, %xmm15, %xmm15
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm8, %ymm8
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm8[0],ymm11[1,2,3,4,5,6,7],ymm8[8],ymm11[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm13[0,1],xmm8[2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm11[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm8 = mem[0,1],ymm6[2],mem[3,4],ymm6[5],mem[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm11
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1],xmm8[2],xmm11[3],xmm8[4],xmm11[5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm8, %xmm8
; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm14
; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm5
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1,2,3,4,5],xmm14[6],xmm5[7]
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm13, %xmm3
; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2,3,4],ymm3[5,6,7],ymm8[8,9,10,11,12],ymm3[13,14,15]
; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm8 = ymm6[0,1,2],mem[3],ymm6[4,5],mem[6],ymm6[7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1,2,3,4,5,6],ymm13[7,8],ymm8[9,10,11,12,13,14],ymm13[15]
; AVX2-FP-NEXT: vpshufb %ymm2, %ymm8, %ymm2
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm13
; AVX2-FP-NEXT: vpshufb %xmm9, %xmm13, %xmm10
; AVX2-FP-NEXT: vpshufb %xmm4, %xmm8, %xmm4
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm3[1,2,3,4,5,6,7],ymm2[8],ymm3[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6,7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6,7,8],ymm3[9],ymm2[10,11,12,13,14,15]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4],xmm3[5],xmm4[6,7]
; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FP-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm7 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15]
; AVX2-FP-NEXT: vpshufb %xmm6, %xmm7, %xmm7
; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm7 = mem[0,1,2],ymm3[3],mem[4,5],ymm3[6],mem[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm9
; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm9, %xmm9
; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
; AVX2-FP-NEXT: vpshufb %ymm7, %ymm2, %ymm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm8[1,2,3,4,5,6,7],ymm2[8],ymm8[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FP-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm8 = ymm12[0],mem[1],ymm12[2,3],mem[4],ymm12[5,6,7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6,7,8],ymm9[9],ymm8[10,11,12,13,14,15]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm9 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm12
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3],xmm12[4],xmm9[5],xmm12[6,7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm4, %xmm9, %xmm1
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm1 = mem[0,1,2],ymm1[3],mem[4,5],ymm1[6],mem[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm9
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm9, %xmm9
; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
; AVX2-FP-NEXT: vpshufb %ymm7, %ymm8, %ymm8
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6,7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4,5,6,7,8],ymm8[9],ymm1[10,11,12,13,14,15]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7]
; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-FP-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3,4,5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm4, %xmm8, %xmm8
; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX2-FP-NEXT: vpshufb %xmm6, %xmm9, %xmm9
; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm9 = mem[0,1,2],ymm9[3],mem[4,5],ymm9[6],mem[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm12
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm12, %xmm12
; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0],ymm8[1,2,3,4,5,6,7],ymm1[8],ymm8[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7]
; AVX2-FP-NEXT: vpshufb %xmm4, %xmm8, %xmm4
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0],xmm5[1],xmm14[2,3,4,5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm5
; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15]
; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6,7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7,8],ymm6[9],ymm5[10,11,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %ymm7, %ymm5, %ymm5
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm7, %xmm3
; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, (%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, (%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 96(%r8)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 32(%r8)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 64(%r8)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, (%r8)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 96(%r9)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 32(%r9)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, (%r9)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 64(%r9)
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rax)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rax)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rax)
; AVX2-FP-NEXT: vmovdqa %ymm10, (%rax)
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: vmovdqa %ymm1, 32(%rax)
; AVX2-FP-NEXT: vmovdqa %ymm3, (%rax)
; AVX2-FP-NEXT: vmovdqa %ymm0, 96(%rax)
; AVX2-FP-NEXT: vmovdqa %ymm2, 64(%rax)
; AVX2-FP-NEXT: addq $1448, %rsp # imm = 0x5A8
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i16_stride7_vf64:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: subq $1544, %rsp # imm = 0x608
; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm6
; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm7
; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm14
; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm15
; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm12
; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm13
; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2
; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm9
; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm11
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,28,29,30,31,18,19,22,23,28,29,18,19]
; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm9[2],ymm2[3,4,5],ymm9[6],ymm2[7]
; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm10
; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,14,15,12,13,10,11,8,9,8,9,8,9,8,9]
; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm2
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,0]
; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7]
; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm15
; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm13
; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7]
; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3
; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm12
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm1[1],ymm12[2,3,4],ymm1[5],ymm12[6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6
; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm3, %ymm6, %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm3
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4
; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm0
; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm2
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm2
; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7]
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,1,0,4,0,0,0]
; AVX2-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm6
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25]
; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm8
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11]
; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7
; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7
; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5],mem[6],ymm7[7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7]
; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7
; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm8 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7]
; AVX2-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm8
; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm8
; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7
; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm14
; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7]
; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm1[2],ymm12[3,4],ymm1[5],ymm12[6,7]
; AVX2-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm8
; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm8
; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7
; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7]
; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1],ymm3[2],ymm15[3,4],ymm3[5],ymm15[6,7]
; AVX2-FCP-NEXT: vpermd %ymm7, %ymm4, %ymm4
; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm6, %ymm4, %ymm3
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm10[1],ymm2[2,3],ymm10[4],ymm2[5,6,7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7]
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,1,0,5,0,0,0]
; AVX2-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm6
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27]
; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm8
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13]
; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7
; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm2
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5,6,7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7]
; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7]
; AVX2-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm8
; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm8
; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm1
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm1
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6,7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7]
; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7
; AVX2-FCP-NEXT: vmovdqa %ymm15, %ymm11
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm8
; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm8
; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm7
; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5,6,7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7]
; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vpermd %ymm7, %ymm4, %ymm4
; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,0,0,0]
; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[1,3,2,3]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17]
; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm7
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15]
; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7]
; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3]
; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm7
; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm3
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7]
; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm6
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3]
; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm7
; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm2
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm6 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7]
; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm5
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2],ymm0[3],ymm11[4,5],ymm0[6],ymm11[7]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,3,2,3]
; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4
; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm10
; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm10[3],ymm1[4,5],ymm10[6],ymm1[7]
; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm11
; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,6,2,5,3,6,2,5]
; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31]
; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm6
; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,2]
; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm7
; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm14
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm6[5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm12
; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm7
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm12[3],ymm7[4,5],ymm12[6],ymm7[7]
; AVX2-FCP-NEXT: vpermd %ymm0, %ymm5, %ymm0
; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm8
; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm2
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm6
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7]
; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm15
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm2
; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm8
; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2]
; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm9
; AVX2-FCP-NEXT: vmovdqa %ymm14, %ymm13
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
; AVX2-FCP-NEXT: vpblendd $31, (%rsp), %ymm8, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm8[5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %ymm8
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1,2],ymm0[3],ymm8[4,5],ymm0[6],ymm8[7]
; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm9
; AVX2-FCP-NEXT: vpermd %ymm14, %ymm5, %ymm5
; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5
; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2]
; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm14
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm14[7]
; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm5[5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm13
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm14
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm14[4],xmm5[5],xmm14[6],xmm5[7]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm5
; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
13544 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm4
13545 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
13546 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13547 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15]
13548 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
13549 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13550 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm7[2],ymm12[3,4,5],ymm7[6],ymm12[7]
13551 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
13552 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
13553 ; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm4
13554 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
13555 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm1
13556 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
13557 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
13558 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm1[1,2,3,4,5,6,7],ymm4[8],ymm1[9,10,11,12,13,14,15]
13559 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
13560 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13561 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm6[2],ymm15[3,4,5],ymm6[6],ymm15[7]
13562 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
13563 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7]
13564 ; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1
13565 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
13566 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2
13567 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
13568 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13569 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
13570 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
13571 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13572 ; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm15
13573 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7]
13574 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
13575 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
13576 ; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1
13577 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0
13578 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
13579 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
13580 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13581 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
13582 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13583 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13584 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7]
13585 ; AVX2-FCP-NEXT: vmovdqa %ymm10, %ymm14
13586 ; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13587 ; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13588 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
13589 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
13590 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
13591 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
13592 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13593 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5]
13594 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
13595 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm2, %ymm3
13596 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
13597 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
13598 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
13599 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
13600 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
13601 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13602 ; AVX2-FCP-NEXT: vmovdqa %ymm7, %ymm5
13603 ; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm10
13604 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7]
13605 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
13606 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
13607 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
13608 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13609 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
13610 ; AVX2-FCP-NEXT: vpermd %ymm11, %ymm2, %ymm3
13611 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
13612 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
13613 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
13614 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
13615 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
13616 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13617 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7]
13618 ; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13619 ; AVX2-FCP-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill
13620 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
13621 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
13622 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
13623 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13624 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
13625 ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm2, %ymm3
13626 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
13627 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
13628 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
13629 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
13630 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
13631 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13632 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13633 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
13634 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
13635 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
13636 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
13637 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
13638 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13639 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
13640 ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm2, %ymm1
13641 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
13642 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
13643 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13644 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
13645 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13646 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13647 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7]
13648 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
13649 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
13650 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
13651 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
13652 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
13653 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,1,3]
13654 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13655 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
13656 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm2
13657 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
13658 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13659 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
13660 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
13661 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13662 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6,7]
13663 ; AVX2-FCP-NEXT: vmovdqa %ymm10, %ymm14
13664 ; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13665 ; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm0
13666 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13667 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
13668 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7]
13669 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
13670 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
13671 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm11[0,1,1,3]
13672 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm5
13673 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7]
13674 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13675 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15]
13676 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
13677 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13678 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6,7]
13679 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5
13680 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7]
13681 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
13682 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
13683 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm12[0,1,1,3]
13684 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13685 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm5
13686 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7]
13687 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13688 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15]
13689 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
13690 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13691 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5,6,7]
13692 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5
13693 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7]
13694 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
13695 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
13696 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm9[0,1,1,3]
13697 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13698 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
13699 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
13700 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
13701 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15]
13702 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
13703 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13704 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13705 ; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
13706 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
13707 ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm4 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
13708 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm3
13709 ; AVX2-FCP-NEXT: vmovdqa %xmm4, %xmm10
13710 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
13711 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
13712 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
13713 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7]
13714 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
13715 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2],xmm1[3],xmm4[4,5,6,7]
13716 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
13717 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4
13718 ; AVX2-FCP-NEXT: vmovdqa %xmm0, %xmm1
13719 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
13720 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
13721 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2
13722 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm2[7]
13723 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13724 ; AVX2-FCP-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
13725 ; AVX2-FCP-NEXT: # ymm5 = ymm0[0,1],mem[2],ymm0[3,4,5],mem[6],ymm0[7]
13726 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,3,7,2,6,0,0,0]
13727 ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm5
13728 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
13729 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm5
13730 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
13731 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4,5,6,7],ymm5[8],ymm4[9,10,11,12,13,14,15]
13732 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
13733 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13734 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
13735 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
13736 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm7[2],ymm13[3,4],ymm7[5],ymm13[6,7]
13737 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm4
13738 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
13739 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
13740 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
13741 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
13742 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
13743 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7]
13744 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
13745 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7]
13746 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm4
13747 ; AVX2-FCP-NEXT: vmovdqa %xmm1, %xmm10
13748 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
13749 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13750 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0
13751 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
13752 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
13753 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13754 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7]
13755 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm12, %ymm4
13756 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm4
13757 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
13758 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15]
13759 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
13760 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13761 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13762 ; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
13763 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
13764 ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
13765 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3
13766 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
13767 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
13768 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
13769 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
13770 ; AVX2-FCP-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload
13771 ; AVX2-FCP-NEXT: # ymm3 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7]
13772 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
13773 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
13774 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3
13775 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
13776 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
13777 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
13778 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
13779 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
13780 ; AVX2-FCP-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
13781 ; AVX2-FCP-NEXT: # ymm4 = ymm9[0,1],mem[2],ymm9[3,4,5],mem[6],ymm9[7]
13782 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm12, %ymm4
13783 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm4
13784 ; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm10
13785 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
13786 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15]
13787 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
13788 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13789 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13790 ; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
13791 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
13792 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3
13793 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
13794 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
13795 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
13796 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
13797 ; AVX2-FCP-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
13798 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4,5],ymm3[6],mem[7]
13799 ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm12, %ymm2
13800 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
13801 ; AVX2-FCP-NEXT: vpblendd $221, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload
13802 ; AVX2-FCP-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7]
13803 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
13804 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
13805 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
13806 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
13807 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
13808 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
13809 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
13810 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2
13811 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
13812 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
13813 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13814 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13815 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,3,3,0,3,7,7]
13816 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
13817 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
13818 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
13819 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,5,1,4,2,5,1,4]
13820 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
13821 ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
13822 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u]
13823 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15]
13824 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7]
13825 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
13826 ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm4 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
13827 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
13828 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
13829 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
13830 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
13831 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,4,7,3,6,0,0,0]
13832 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm0
13833 ; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm5
13834 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
13835 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm12[1,2,3,4,5,6,7],ymm0[8],ymm12[9,10,11,12,13,14,15]
13836 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3]
13837 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
13838 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13839 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
13840 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm15 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
13841 ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0
13842 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
13843 ; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm12 # 32-byte Folded Reload
13844 ; AVX2-FCP-NEXT: # ymm12 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7]
13845 ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm12
13846 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
13847 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm12, %ymm12
13848 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15]
13849 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
13850 ; AVX2-FCP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm12 # 32-byte Folded Reload
13851 ; AVX2-FCP-NEXT: # ymm12 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7]
13852 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
13853 ; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm13, %xmm13
13854 ; AVX2-FCP-NEXT: vmovdqa %xmm4, %xmm6
13855 ; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm2 = [10,11,6,7,0,0,0,0,0,0,0,0,0,0,0,0]
13856 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm12
13857 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
13858 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
13859 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
13860 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm7[0,1,2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7]
13861 ; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm4
13862 ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm5, %ymm13
13863 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
13864 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm13
13865 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15]
13866 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
13867 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
13868 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13869 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
13870 ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0
13871 ; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload
13872 ; AVX2-FCP-NEXT: # ymm12 = ymm8[0,1],mem[2],ymm8[3,4],mem[5],ymm8[6,7]
13873 ; AVX2-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm12
13874 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm12, %ymm12
13875 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15]
13876 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
13877 ; AVX2-FCP-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload
13878 ; AVX2-FCP-NEXT: # ymm12 = mem[0,1],ymm8[2,3],mem[4,5],ymm8[6,7]
13879 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
13880 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm13
13881 ; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm12
13882 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
13883 ; AVX2-FCP-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm13 # 32-byte Folded Reload
13884 ; AVX2-FCP-NEXT: # ymm13 = ymm9[0,1,2],mem[3],ymm9[4,5],mem[6],ymm9[7]
13885 ; AVX2-FCP-NEXT: vpermd %ymm13, %ymm4, %ymm13
13886 ; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm2
13887 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm13
13888 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1,2,3,4,5,6,7],ymm13[8],ymm0[9,10,11,12,13,14,15]
13889 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
13890 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
13891 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13892 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
13893 ; AVX2-FCP-NEXT: vpermd %ymm15, %ymm11, %ymm0
13894 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25]
13895 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
13896 ; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload
13897 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7]
13898 ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm3
13899 ; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3
13900 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15]
13901 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
13902 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
13903 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7]
13904 ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm2
13905 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm1
13906 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
13907 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
13908 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm13[2,3],ymm11[4,5],ymm13[6,7]
13909 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
13910 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
13911 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
13912 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
13913 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
13914 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
13915 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13916 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13917 ; AVX2-FCP-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
13918 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5],ymm14[6,7]
13919 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0]
13920 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1]
13921 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
13922 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
13923 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm5
13924 ; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,6,1,5,2,6,1,5]
13925 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
13926 ; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0
13927 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u]
13928 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7],ymm0[8,9,10,11,12],ymm5[13,14,15]
13929 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7]
13930 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
13931 ; AVX2-FCP-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
13932 ; AVX2-FCP-NEXT: # ymm7 = ymm5[0,1,2],mem[3],ymm5[4,5],mem[6],ymm5[7]
13933 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm10
13934 ; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
13935 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm10, %xmm10
13936 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7]
13937 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
13938 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,4,0,3,7,0,0,0]
13939 ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm10, %ymm6
13940 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
13941 ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm6
13942 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15]
13943 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
13944 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
13945 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7]
13946 ; AVX2-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm7
13947 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm7
13948 ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm6
13949 ; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm15 = [28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
13950 ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm6
13951 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7],ymm6[8,9,10,11,12],ymm7[13,14,15]
13952 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm12[1],ymm8[2,3],ymm12[4],ymm8[5,6,7]
13953 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7]
13954 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
13955 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm9
13956 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7]
13957 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
13958 ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm10, %ymm7
13959 ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7
13960 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15]
13961 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
13962 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
13963 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
13964 ; AVX2-FCP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
13965 ; AVX2-FCP-NEXT: # ymm7 = ymm7[0,1],mem[2,3],ymm7[4,5],mem[6,7]
13966 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
13967 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm8
13968 ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm3, %ymm7
13969 ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm7, %ymm7
13970 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
13971 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
13972 ; AVX2-FCP-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
13973 ; AVX2-FCP-NEXT: # ymm8 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5,6,7]
13974 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
13975 ; AVX2-FCP-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
13976 ; AVX2-FCP-NEXT: # ymm9 = mem[0,1,2],ymm9[3],mem[4,5],ymm9[6],mem[7]
13977 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11
13978 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm11
13979 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,4,6,7]
13980 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
13981 ; AVX2-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8
13982 ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm8, %ymm8
13983 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
13984 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
13985 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
13986 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
13987 ; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
13988 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13989 ; AVX2-FCP-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
13990 ; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7]
13991 ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
13992 ; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2
13993 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15]
13994 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13995 ; AVX2-FCP-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
13996 ; AVX2-FCP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6,7]
13997 ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm10, %ymm2
13998 ; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
13999 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
14000 ; AVX2-FCP-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
14001 ; AVX2-FCP-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5],mem[6],ymm3[7]
14002 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
14003 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
14004 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7]
14005 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
14006 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
14007 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
14008 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
14009 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14010 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rsi)
14011 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14012 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rsi)
14013 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14014 ; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rsi)
14015 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14016 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi)
14017 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14018 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rdx)
14019 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14020 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rdx)
14021 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14022 ; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rdx)
14023 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14024 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rdx)
14025 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14026 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rcx)
14027 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14028 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rcx)
14029 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14030 ; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rcx)
14031 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14032 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rcx)
14033 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14034 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%r8)
14035 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14036 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%r8)
14037 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14038 ; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%r8)
14039 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14040 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%r8)
14041 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14042 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%r9)
14043 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14044 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%r9)
14045 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14046 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%r9)
14047 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14048 ; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%r9)
14049 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
14050 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14051 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rax)
14052 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14053 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rax)
14054 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14055 ; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rax)
14056 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14057 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rax)
14058 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
14059 ; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%rax)
14060 ; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rax)
14061 ; AVX2-FCP-NEXT: vmovdqa %ymm6, 96(%rax)
14062 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%rax)
14063 ; AVX2-FCP-NEXT: addq $1544, %rsp # imm = 0x608
14064 ; AVX2-FCP-NEXT: vzeroupper
14065 ; AVX2-FCP-NEXT: retq
14066 ;
14067 ; AVX512-LABEL: load_i16_stride7_vf64:
14068 ; AVX512: # %bb.0:
14069 ; AVX512-NEXT: subq $1864, %rsp # imm = 0x748
14070 ; AVX512-NEXT: vmovdqa 480(%rdi), %ymm1
14071 ; AVX512-NEXT: vmovdqa 448(%rdi), %ymm2
14072 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7]
14073 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm16
14074 ; AVX512-NEXT: vmovdqa64 %ymm1, %ymm18
14075 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
14076 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
14077 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u]
14078 ; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2
14079 ; AVX512-NEXT: vmovdqa 512(%rdi), %ymm3
14080 ; AVX512-NEXT: vmovdqa 544(%rdi), %ymm4
14081 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
14082 ; AVX512-NEXT: vmovdqa %ymm4, %ymm10
14083 ; AVX512-NEXT: vmovdqa %ymm3, %ymm13
14084 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3]
14085 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
14086 ; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm3
14087 ; AVX512-NEXT: vpor %ymm3, %ymm2, %ymm2
14088 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14089 ; AVX512-NEXT: vpbroadcastw 700(%rdi), %xmm2
14090 ; AVX512-NEXT: vmovdqa 672(%rdi), %xmm4
14091 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3]
14092 ; AVX512-NEXT: vmovdqa64 %xmm4, %xmm22
14093 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
14094 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
14095 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14096 ; AVX512-NEXT: vmovdqa (%rdi), %ymm4
14097 ; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5
14098 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm6
14099 ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm15
14100 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm6[1],ymm15[2,3,4],ymm6[5],ymm15[6,7]
14101 ; AVX512-NEXT: vmovdqa %ymm6, %ymm12
14102 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
14103 ; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm1
14104 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
14105 ; AVX512-NEXT: vmovdqa %ymm5, %ymm6
14106 ; AVX512-NEXT: vmovdqa %ymm4, %ymm8
14107 ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
14108 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7]
14109 ; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm0
14110 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
14111 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14112 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm9
14113 ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm11
14114 ; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm21
14115 ; AVX512-NEXT: vpermq {{.*#+}} ymm24 = ymm21[0,1,0,2]
14116 ; AVX512-NEXT: vpbroadcastw 252(%rdi), %xmm0
14117 ; AVX512-NEXT: vmovdqa 224(%rdi), %xmm4
14118 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3]
14119 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
14120 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
14121 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14122 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm0
14123 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7]
14124 ; AVX512-NEXT: vmovdqa %ymm12, %ymm14
14125 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7,8,9,10],ymm0[11],ymm3[12,13,14,15]
14126 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
14127 ; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
14128 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7]
14129 ; AVX512-NEXT: vmovdqa64 %ymm8, %ymm17
14130 ; AVX512-NEXT: vmovdqa64 %ymm6, %ymm19
14131 ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
14132 ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7]
14133 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14134 ; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm5
14135 ; AVX512-NEXT: vpor %ymm0, %ymm5, %ymm0
14136 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14137 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm11[2],ymm9[3,4,5],ymm11[6],ymm9[7]
14138 ; AVX512-NEXT: vmovdqa64 %ymm11, %ymm20
14139 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm5
14140 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7]
14141 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
14142 ; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm0
14143 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14144 ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm24[0,1,1,3,4,5,5,7]
14145 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
14146 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5,6],ymm7[7]
14147 ; AVX512-NEXT: vmovdqa 240(%rdi), %xmm0
14148 ; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm4[1],xmm0[2,3,4,5,6,7]
14149 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm23
14150 ; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
14151 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,0,3,3,4,5,6,7]
14152 ; AVX512-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm0
14153 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14154 ; AVX512-NEXT: vmovdqa 528(%rdi), %xmm7
14155 ; AVX512-NEXT: vmovdqa %ymm10, %ymm12
14156 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7]
14157 ; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6,7,8,9,10],ymm7[11],ymm8[12,13,14,15]
14158 ; AVX512-NEXT: vpshufb %ymm3, %ymm7, %ymm3
14159 ; AVX512-NEXT: vmovdqa64 %ymm16, %ymm10
14160 ; AVX512-NEXT: vmovdqa64 %ymm18, %ymm11
14161 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7]
14162 ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8
14163 ; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7]
14164 ; AVX512-NEXT: vpshufb %ymm6, %ymm7, %ymm6
14165 ; AVX512-NEXT: vpor %ymm3, %ymm6, %ymm3
14166 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14167 ; AVX512-NEXT: vmovdqa 576(%rdi), %ymm0
14168 ; AVX512-NEXT: vmovdqa 608(%rdi), %ymm6
14169 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm6[2],ymm0[3,4,5],ymm6[6],ymm0[7]
14170 ; AVX512-NEXT: vmovdqa64 %ymm6, %ymm26
14171 ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm30
14172 ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm6
14173 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7]
14174 ; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm3
14175 ; AVX512-NEXT: vmovdqa64 640(%rdi), %ymm16
14176 ; AVX512-NEXT: vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2]
14177 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
14178 ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7]
14179 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
14180 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5,6],ymm5[7]
14181 ; AVX512-NEXT: vmovdqa 688(%rdi), %xmm3
14182 ; AVX512-NEXT: vmovdqa64 %xmm22, %xmm2
14183 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm2[1],xmm3[2,3,4,5,6,7]
14184 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
14185 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7]
14186 ; AVX512-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0
14187 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14188 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
14189 ; AVX512-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14190 ; AVX512-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14191 ; AVX512-NEXT: vmovdqa64 %ymm14, %ymm22
14192 ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
14193 ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15]
14194 ; AVX512-NEXT: vmovdqa64 %ymm17, %ymm14
14195 ; AVX512-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14196 ; AVX512-NEXT: vmovdqa64 %ymm19, %ymm1
14197 ; AVX512-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14198 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm14[1],ymm1[2,3],ymm14[4],ymm1[5,6,7]
14199 ; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7
14200 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7]
14201 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
14202 ; AVX512-NEXT: vpshufb %ymm7, %ymm5, %ymm8
14203 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14204 ; AVX512-NEXT: vpshufb %ymm5, %ymm6, %ymm6
14205 ; AVX512-NEXT: vpor %ymm6, %ymm8, %ymm6
14206 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14207 ; AVX512-NEXT: vmovdqa64 %ymm20, %ymm0
14208 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7]
14209 ; AVX512-NEXT: vmovdqa64 %ymm9, %ymm19
14210 ; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm8
14211 ; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7]
14212 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
14213 ; AVX512-NEXT: vpshufb %xmm6, %xmm8, %xmm8
14214 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
14215 ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm21[0,1,1,2]
14216 ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,0,3,4,5,4,7]
14217 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
14218 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
14219 ; AVX512-NEXT: vmovdqa64 %xmm23, %xmm0
14220 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
14221 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14222 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,1,2,3]
14223 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7]
14224 ; AVX512-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0
14225 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14226 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7]
14227 ; AVX512-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14228 ; AVX512-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14229 ; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9
14230 ; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7,8,9,10,11],ymm9[12],ymm8[13,14,15]
14231 ; AVX512-NEXT: vpshufb %ymm7, %ymm8, %ymm7
14232 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7]
14233 ; AVX512-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14234 ; AVX512-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14235 ; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9
14236 ; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
14237 ; AVX512-NEXT: vpshufb %ymm5, %ymm8, %ymm5
14238 ; AVX512-NEXT: vpor %ymm7, %ymm5, %ymm5
14239 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14240 ; AVX512-NEXT: vmovdqa64 %ymm26, %ymm9
14241 ; AVX512-NEXT: vmovdqa64 %ymm30, %ymm0
14242 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7]
14243 ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm7
14244 ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7]
14245 ; AVX512-NEXT: vpshufb %xmm6, %xmm5, %xmm5
14246 ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
14247 ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm16[0,1,1,2]
14248 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7]
14249 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
14250 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
14251 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
14252 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14253 ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm28
14254 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3]
14255 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
14256 ; AVX512-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0
14257 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14258 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7]
14259 ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
14260 ; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
14261 ; AVX512-NEXT: vmovdqa64 %ymm22, %ymm0
14262 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2],ymm0[3],ymm15[4,5],ymm0[6],ymm15[7]
14263 ; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm5[1,3,2,3]
14264 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
14265 ; AVX512-NEXT: vpshufb %ymm5, %ymm6, %ymm8
14266 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
14267 ; AVX512-NEXT: vpshufb %ymm6, %ymm7, %ymm7
14268 ; AVX512-NEXT: vpor %ymm7, %ymm8, %ymm1
14269 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14270 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7]
14271 ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8
14272 ; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5,6,7]
14273 ; AVX512-NEXT: vpshufb %ymm6, %ymm7, %ymm6
14274 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7]
14275 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3]
14276 ; AVX512-NEXT: vpshufb %ymm5, %ymm7, %ymm5
14277 ; AVX512-NEXT: vpor %ymm5, %ymm6, %ymm0
14278 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14279 ; AVX512-NEXT: vmovdqa64 %ymm20, %ymm15
14280 ; AVX512-NEXT: vmovdqa64 %ymm19, %ymm10
14281 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7]
14282 ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
14283 ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
14284 ; AVX512-NEXT: vpermq {{.*#+}} ymm19 = ymm21[0,1,1,3]
14285 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
14286 ; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm5
14287 ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
14288 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,1,2,1,4,5,6,5]
14289 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
14290 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
14291 ; AVX512-NEXT: vpbroadcastw 232(%rdi), %xmm6
14292 ; AVX512-NEXT: vpsrlq $48, %xmm23, %xmm7
14293 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
14294 ; AVX512-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0
14295 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14296 ; AVX512-NEXT: vmovdqa64 %ymm26, %ymm12
14297 ; AVX512-NEXT: vmovdqa64 %ymm30, %ymm13
14298 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7]
14299 ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
14300 ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
14301 ; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm1
14302 ; AVX512-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,1,3]
14303 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14304 ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[0,1,2,1,4,5,6,5]
14305 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
14306 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7]
14307 ; AVX512-NEXT: vpbroadcastw 680(%rdi), %xmm5
14308 ; AVX512-NEXT: vpsrlq $48, %xmm3, %xmm6
14309 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
14310 ; AVX512-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm0
14311 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14312 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7]
14313 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm5
14314 ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4],xmm5[5,6,7]
14315 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
14316 ; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm5
14317 ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
14318 ; AVX512-NEXT: vmovdqa 208(%rdi), %xmm6
14319 ; AVX512-NEXT: vmovdqa 192(%rdi), %xmm7
14320 ; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,5],xmm6[6],xmm7[7]
14321 ; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
14322 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6]
14323 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
14324 ; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7],ymm5[8,9,10,11,12],ymm8[13,14,15]
14325 ; AVX512-NEXT: vmovdqa %xmm4, %xmm2
14326 ; AVX512-NEXT: vpsrld $16, %xmm4, %xmm8
14327 ; AVX512-NEXT: vmovdqa64 %xmm23, %xmm4
14328 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
14329 ; AVX512-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm0
14330 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14331 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7]
14332 ; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm8
14333 ; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3],xmm5[4],xmm8[5,6,7]
14334 ; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm1
14335 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8
14336 ; AVX512-NEXT: vmovdqa 656(%rdi), %xmm1
14337 ; AVX512-NEXT: vmovdqa 640(%rdi), %xmm5
14338 ; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0,1,2,3,4,5],xmm1[6],xmm5[7]
14339 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,3]
14340 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,7,6]
14341 ; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
14342 ; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
14343 ; AVX512-NEXT: vmovdqa64 %xmm28, %xmm11
14344 ; AVX512-NEXT: vpsrld $16, %xmm28, %xmm9
14345 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7]
14346 ; AVX512-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0
14347 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14348 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
14349 ; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9
14350 ; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7]
14351 ; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3,4,5,6,7]
14352 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
14353 ; AVX512-NEXT: vpshufb %xmm6, %xmm8, %xmm8
14354 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
14355 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
14356 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
14357 ; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
14358 ; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15]
14359 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
14360 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
14361 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
14362 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0
14363 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14364 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7]
14365 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4
14366 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
14367 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
14368 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
14369 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
14370 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14371 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,1,4,5,6,5]
14372 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
14373 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
14374 ; AVX512-NEXT: movw $992, %ax # imm = 0x3E0
14375 ; AVX512-NEXT: kmovw %eax, %k1
14376 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14377 ; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload
14378 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14379 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7]
14380 ; AVX512-NEXT: vmovdqa64 %ymm26, %ymm17
14381 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
14382 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7]
14383 ; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm0
14384 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5,6,7]
14385 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14386 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
14387 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
14388 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14389 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
14390 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7]
14391 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
14392 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
14393 ; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
14394 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14395 ; AVX512-NEXT: vmovdqa 704(%rdi), %ymm1
14396 ; AVX512-NEXT: vmovdqa 736(%rdi), %ymm2
14397 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
14398 ; AVX512-NEXT: vmovdqa %ymm2, %ymm5
14399 ; AVX512-NEXT: vmovdqa %ymm1, %ymm7
14400 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
14401 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7]
14402 ; AVX512-NEXT: vmovdqa 800(%rdi), %ymm3
14403 ; AVX512-NEXT: vmovdqa 768(%rdi), %ymm2
14404 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
14405 ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm20
14406 ; AVX512-NEXT: vmovdqa %ymm2, %ymm4
14407 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
14408 ; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15]
14409 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
14410 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
14411 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19]
14412 ; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2
14413 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
14414 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
14415 ; AVX512-NEXT: vmovdqa 832(%rdi), %ymm3
14416 ; AVX512-NEXT: vmovdqa 864(%rdi), %ymm8
14417 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7]
14418 ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm21
14419 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
14420 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
14421 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
14422 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
14423 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
14424 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
14425 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
14426 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14427 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7]
14428 ; AVX512-NEXT: vmovdqa64 %ymm15, %ymm22
14429 ; AVX512-NEXT: vmovdqa64 %ymm10, %ymm23
14430 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
14431 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
14432 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
14433 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
14434 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
14435 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14436 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm24[0,1,2,1,4,5,6,5]
14437 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
14438 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
14439 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14440 ; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload
14441 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14442 ; AVX512-NEXT: vmovdqa 256(%rdi), %ymm2
14443 ; AVX512-NEXT: vmovdqa 288(%rdi), %ymm10
14444 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7]
14445 ; AVX512-NEXT: vmovdqa %ymm10, %ymm15
14446 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm25
14447 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
14448 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7]
14449 ; AVX512-NEXT: vmovdqa 352(%rdi), %ymm14
14450 ; AVX512-NEXT: vmovdqa 320(%rdi), %ymm10
14451 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm14[3],ymm10[4,5],ymm14[6],ymm10[7]
14452 ; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1]
14453 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6],ymm0[7,8,9,10,11,12,13],ymm13[14],ymm0[15]
14454 ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
14455 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,1]
14456 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
14457 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4,5,6],xmm0[7]
14458 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14459 ; AVX512-NEXT: vmovdqa 384(%rdi), %ymm1
14460 ; AVX512-NEXT: vmovdqa 416(%rdi), %ymm3
14461 ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7]
14462 ; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm12
14463 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7]
14464 ; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
14465 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,2,4,5,6,7]
14466 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
14467 ; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
14468 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7]
14469 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14470 ; AVX512-NEXT: vmovdqa64 %ymm25, %ymm2
14471 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm2[3],ymm15[4,5],ymm2[6],ymm15[7]
14472 ; AVX512-NEXT: vmovdqa64 %ymm15, %ymm24
14473 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm12
14474 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1,2,3],xmm0[4],xmm12[5],xmm0[6],xmm12[7]
14475 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm10[1],ymm14[2,3],ymm10[4],ymm14[5,6,7]
14476 ; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
14477 ; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6],ymm13[7,8],ymm12[9,10,11,12,13,14],ymm13[15]
14478 ; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm13 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
14479 ; AVX512-NEXT: vpshufb %xmm13, %xmm0, %xmm0
14480 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
14481 ; AVX512-NEXT: vpshufb %ymm9, %ymm12, %ymm12
14482 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3,4,5,6],xmm12[7]
14483 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
14484 ; AVX512-NEXT: vmovdqa %ymm1, %ymm15
14485 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7]
14486 ; AVX512-NEXT: vextracti32x4 $1, %ymm12, %xmm25
14487 ; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm25[2,1,2,3]
14488 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7]
14489 ; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
14490 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7]
14491 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
14492 ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
14493 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
14494 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14495 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7]
14496 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm11
14497 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1,2,3],xmm0[4],xmm11[5],xmm0[6],xmm11[7]
14498 ; AVX512-NEXT: vpshufb %xmm13, %xmm0, %xmm0
14499 ; AVX512-NEXT: vmovdqa64 %ymm20, %ymm6
14500 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7]
14501 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
14502 ; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6],ymm12[7,8],ymm11[9,10,11,12,13,14],ymm12[15]
14503 ; AVX512-NEXT: vpshufb %ymm9, %ymm11, %ymm1
14504 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7]
14505 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14506 ; AVX512-NEXT: vmovdqa64 %ymm21, %ymm9
14507 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
14508 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm11
14509 ; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
14510 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7]
14511 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
14512 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
14513 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3]
14514 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14515 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
14516 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14517 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7]
14518 ; AVX512-NEXT: vmovdqa64 %ymm14, %ymm21
14519 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23]
14520 ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm11
14521 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
14522 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4,5,6,7,8],ymm0[9],ymm11[10,11,12,13,14,15]
14523 ; AVX512-NEXT: vmovdqa64 %ymm24, %ymm14
14524 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm14[2],ymm2[3,4,5],ymm14[6],ymm2[7]
14525 ; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm12
14526 ; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
14527 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
14528 ; AVX512-NEXT: vpshufb %ymm12, %ymm11, %ymm11
14529 ; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
14530 ; AVX512-NEXT: vpternlogq $242, %ymm0, %ymm20, %ymm11
14531 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7]
14532 ; AVX512-NEXT: vmovdqa %ymm15, %ymm13
14533 ; AVX512-NEXT: vextracti32x4 $1, %ymm0, %xmm25
14534 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
14535 ; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm25[0,1,2,1]
14536 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,7]
14537 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
14538 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14539 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
14540 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14541 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
14542 ; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm1
14543 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
14544 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15]
14545 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm5[2],ymm7[3,4,5],ymm5[6],ymm7[7]
14546 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm11
14547 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5],xmm11[6],xmm1[7]
14548 ; AVX512-NEXT: vpshufb %ymm12, %ymm1, %ymm1
14549 ; AVX512-NEXT: vpternlogq $242, %ymm0, %ymm20, %ymm1
14550 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7]
14551 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm11
14552 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
14553 ; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1]
14554 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,7]
14555 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
14556 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14557 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
14558 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14559 ; AVX512-NEXT: vmovdqa64 %ymm22, %ymm0
14560 ; AVX512-NEXT: vmovdqa64 %ymm23, %ymm1
14561 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7]
14562 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
14563 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
14564 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
14565 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
14566 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14567 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[0,1,2,0,4,5,6,4]
14568 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
14569 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
14570 ; AVX512-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
14571 ; AVX512-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7]
14572 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
14573 ; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
14574 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14575 ; AVX512-NEXT: vmovdqa %ymm2, %ymm15
14576 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm14[3],ymm2[4,5],ymm14[6],ymm2[7]
14577 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
14578 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
14579 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm3[2],ymm13[3,4,5],ymm3[6],ymm13[7]
14580 ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm19
14581 ; AVX512-NEXT: vmovdqa64 %ymm13, %ymm26
14582 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm11
14583 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5],xmm11[6],xmm1[7]
14584 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128]
14585 ; AVX512-NEXT: vpshufb %xmm11, %xmm0, %xmm0
14586 ; AVX512-NEXT: vmovdqa %ymm10, %ymm2
14587 ; AVX512-NEXT: vmovdqa64 %ymm21, %ymm10
14588 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1],ymm2[2],ymm10[3,4],ymm2[5],ymm10[6,7]
14589 ; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,2,0]
14590 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
14591 ; AVX512-NEXT: vpshufb %ymm13, %ymm12, %ymm12
14592 ; AVX512-NEXT: vpor %ymm0, %ymm12, %ymm0
14593 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
14594 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
14595 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14596 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7],ymm12[8,9,10],ymm1[11,12,13,14,15]
14597 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14598 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14599 ; AVX512-NEXT: vmovdqa64 %ymm17, %ymm0
14600 ; AVX512-NEXT: vmovdqa64 %ymm30, %ymm1
14601 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7]
14602 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
14603 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
14604 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
14605 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
14606 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14607 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[0,1,2,0,4,5,6,4]
14608 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
14609 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
14610 ; AVX512-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
14611 ; AVX512-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7]
14612 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
14613 ; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm25
14614 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7]
14615 ; AVX512-NEXT: vmovdqa64 %ymm5, %ymm24
14616 ; AVX512-NEXT: vmovdqa64 %ymm7, %ymm23
14617 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
14618 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
14619 ; AVX512-NEXT: vpshufb %xmm11, %xmm0, %xmm0
14620 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
14621 ; AVX512-NEXT: vmovdqa64 %ymm6, %ymm21
14622 ; AVX512-NEXT: vmovdqa64 %ymm4, %ymm22
14623 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,0]
14624 ; AVX512-NEXT: vpshufb %ymm13, %ymm1, %ymm1
14625 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7]
14626 ; AVX512-NEXT: vmovdqa %ymm8, %ymm5
14627 ; AVX512-NEXT: vmovdqa %ymm9, %ymm4
14628 ; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm12
14629 ; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
14630 ; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
14631 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,0,0,0,4,5,6,7]
14632 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4]
14633 ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
14634 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm11[3,4,5,6,7],ymm1[8,9,10],ymm11[11,12,13,14,15]
14635 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14636 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14637 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14638 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
14639 ; AVX512-NEXT: vmovdqa64 %ymm16, %ymm0
14640 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
14641 ; AVX512-NEXT: vmovdqa64 %ymm1, %ymm31
14642 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
14643 ; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm11
14644 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
14645 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6],ymm11[7,8,9,10,11,12,13],ymm0[14],ymm11[15]
14646 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
14647 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload
14648 ; AVX512-NEXT: vmovdqa64 %ymm20, %ymm1
14649 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7]
14650 ; AVX512-NEXT: vmovdqa64 %ymm6, %ymm28
14651 ; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm12
14652 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7]
14653 ; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
14654 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7]
14655 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
14656 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5,6,7]
14657 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14658 ; AVX512-NEXT: vmovdqa64 %ymm26, %ymm7
14659 ; AVX512-NEXT: vmovdqa64 %ymm19, %ymm8
14660 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7]
14661 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm11
14662 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
14663 ; AVX512-NEXT: vmovdqa %ymm10, %ymm9
14664 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7]
14665 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm17
14666 ; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm10[0,1,0,1]
14667 ; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3],ymm11[4,5,6,7,8,9,10],ymm13[11],ymm11[12,13,14,15]
14668 ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6,7]
14669 ; AVX512-NEXT: vmovdqa64 %ymm15, %ymm26
14670 ; AVX512-NEXT: vmovdqa64 %ymm14, %ymm27
14671 ; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm15
14672 ; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3,4,5],xmm15[6],xmm13[7]
14673 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
14674 ; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm0
14675 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14676 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128]
14677 ; AVX512-NEXT: vpshufb %ymm1, %ymm11, %ymm11
14678 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15]
14679 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,2,3,0,1,14,15,12,13,128,128]
14680 ; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm13
14681 ; AVX512-NEXT: vpor %ymm11, %ymm13, %ymm11
14682 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
14683 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14684 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
14685 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
14686 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm10[2],ymm15[3,4,5],ymm10[6],ymm15[7]
14687 ; AVX512-NEXT: vmovdqa64 %ymm10, %ymm29
14688 ; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm3
14689 ; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
14690 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6],ymm3[7,8,9,10,11,12,13],ymm0[14],ymm3[15]
14691 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
14692 ; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
14693 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7]
14694 ; AVX512-NEXT: vmovdqa64 %ymm10, %ymm30
14695 ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm11
14696 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
14697 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
14698 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
14699 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3]
14700 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7]
14701 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
14702 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
14703 ; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm19, %zmm3
14704 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14705 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
14706 ; AVX512-NEXT: vmovdqa %ymm4, %ymm14
14707 ; AVX512-NEXT: vmovdqa64 %ymm5, %ymm18
14708 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
14709 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
14710 ; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm0
14711 ; AVX512-NEXT: vmovdqa64 %ymm21, %ymm12
14712 ; AVX512-NEXT: vmovdqa64 %ymm22, %ymm4
14713 ; AVX512-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14714 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7]
14715 ; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm21[0,1,0,1]
14716 ; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm11[3],ymm3[4,5,6,7,8,9,10],ymm11[11],ymm3[12,13,14,15]
14717 ; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm1
14718 ; AVX512-NEXT: vmovdqa64 %ymm24, %ymm6
14719 ; AVX512-NEXT: vmovdqa64 %ymm23, %ymm5
14720 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
14721 ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm11
14722 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm11[1],xmm3[2,3,4,5],xmm11[6],xmm3[7]
14723 ; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2
14724 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14725 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
14726 ; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1
14727 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14728 ; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
14729 ; AVX512-NEXT: vmovdqa64 %ymm17, %ymm0
14730 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7]
14731 ; AVX512-NEXT: vmovdqa64 %ymm9, %ymm24
14732 ; AVX512-NEXT: vmovdqa64 %ymm17, %ymm23
14733 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
14734 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
14735 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7]
14736 ; AVX512-NEXT: vmovdqa64 %ymm8, %ymm17
14737 ; AVX512-NEXT: vmovdqa64 %ymm7, %ymm21
14738 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
14739 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7]
14740 ; AVX512-NEXT: vmovdqa64 %ymm26, %ymm2
14741 ; AVX512-NEXT: vmovdqa64 %ymm27, %ymm3
14742 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
14743 ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
14744 ; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
14745 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128]
14746 ; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
14747 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
14748 ; AVX512-NEXT: vpshufb %xmm11, %xmm1, %xmm1
14749 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14750 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
14751 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,6,7,4,5,2,3,0,1,14,15,128,128]
14752 ; AVX512-NEXT: vpshufb %xmm7, %xmm2, %xmm2
14753 ; AVX512-NEXT: vmovdqa64 %xmm7, %xmm22
14754 ; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
14755 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14756 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14757 ; AVX512-NEXT: vmovdqa64 %ymm31, %ymm10
14758 ; AVX512-NEXT: vmovdqa64 %ymm16, %ymm8
14759 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7]
14760 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
14761 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
14762 ; AVX512-NEXT: vmovdqa64 %ymm28, %ymm9
14763 ; AVX512-NEXT: vmovdqa64 %ymm20, %ymm7
14764 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7]
14765 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
14766 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
14767 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
14768 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
14769 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
14770 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
14771 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
14772 ; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm0
14773 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
14774 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14775 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7]
14776 ; AVX512-NEXT: vmovdqa64 %ymm12, %ymm20
14777 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
14778 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
14779 ; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
14780 ; AVX512-NEXT: vmovdqa64 %ymm18, %ymm12
14781 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5,6,7]
14782 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3
14783 ; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7]
14784 ; AVX512-NEXT: vpshufb %xmm11, %xmm1, %xmm1
14785 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
14786 ; AVX512-NEXT: vmovdqa64 %ymm5, %ymm28
14787 ; AVX512-NEXT: vmovdqa64 %ymm6, %ymm31
14788 ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm11
14789 ; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2],xmm3[3],xmm11[4,5,6,7]
14790 ; AVX512-NEXT: vmovdqa64 %xmm22, %xmm4
14791 ; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
14792 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14793 ; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
14794 ; AVX512-NEXT: vpor %ymm0, %ymm3, %ymm0
14795 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14796 ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm18
14797 ; AVX512-NEXT: vmovdqa64 %ymm29, %ymm1
14798 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm1[3],ymm15[4,5],ymm1[6],ymm15[7]
14799 ; AVX512-NEXT: vmovdqa64 %ymm29, %ymm16
14800 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
14801 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
14802 ; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm0
14803 ; AVX512-NEXT: vmovdqa64 %ymm30, %ymm2
14804 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm2[2,3],ymm13[4,5],ymm2[6,7]
14805 ; AVX512-NEXT: vmovdqa64 %ymm13, %ymm29
14806 ; AVX512-NEXT: vmovdqa64 %ymm30, %ymm13
14807 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
14808 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
14809 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
14810 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
14811 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
14812 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
14813 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
14814 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
14815 ; AVX512-NEXT: vpternlogq $184, %zmm0, %zmm19, %zmm30
14816 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5,6,7]
14817 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
14818 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
14819 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7]
14820 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
14821 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
14822 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
14823 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7]
14824 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
14825 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
14826 ; AVX512-NEXT: vpshufb %ymm9, %ymm0, %ymm0
14827 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3,4,5,6,7]
14828 ; AVX512-NEXT: vmovdqa64 %ymm17, %ymm0
14829 ; AVX512-NEXT: vmovdqa64 %ymm21, %ymm1
14830 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
14831 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
14832 ; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
14833 ; AVX512-NEXT: vmovdqa64 %ymm26, %ymm1
14834 ; AVX512-NEXT: vmovdqa64 %ymm27, %ymm2
14835 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
14836 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
14837 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
14838 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
14839 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
14840 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
14841 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload
14842 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
14843 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload
14844 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload
14845 ; AVX512-NEXT: vmovdqa64 %ymm24, %ymm5
14846 ; AVX512-NEXT: vmovdqa64 %ymm23, %ymm7
14847 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4,5],ymm5[6],ymm7[7]
14848 ; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,1,3,1]
14849 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
14850 ; AVX512-NEXT: vpshufb %ymm5, %ymm7, %ymm7
14851 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3,4,5,6,7]
14852 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15]
14853 ; AVX512-NEXT: vpshufb %xmm7, %xmm0, %xmm0
14854 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14855 ; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15]
14856 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
14857 ; AVX512-NEXT: vmovdqa64 %ymm16, %ymm6
14858 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm15[1],ymm6[2,3],ymm15[4],ymm6[5,6,7]
14859 ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
14860 ; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4,5,6,7,8],ymm8[9],ymm6[10,11,12,13,14,15]
14861 ; AVX512-NEXT: vpshufb %ymm9, %ymm6, %ymm6
14862 ; AVX512-NEXT: vmovdqa64 %ymm29, %ymm8
14863 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm13[3],ymm8[4,5],ymm13[6],ymm8[7]
14864 ; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9
14865 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7]
14866 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
14867 ; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7]
14868 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
14869 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7]
14870 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
14871 ; AVX512-NEXT: vpternlogq $184, %zmm6, %zmm19, %zmm13
14872 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7]
14873 ; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm8
14874 ; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5,6,7]
14875 ; AVX512-NEXT: vpshufb %xmm7, %xmm6, %xmm6
14876 ; AVX512-NEXT: vmovdqa64 %ymm28, %ymm7
14877 ; AVX512-NEXT: vmovdqa64 %ymm31, %ymm8
14878 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
14879 ; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8
14880 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7]
14881 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
14882 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,2,3,4,5,6,7]
14883 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
14884 ; AVX512-NEXT: vmovdqa64 %ymm20, %ymm8
14885 ; AVX512-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
14886 ; AVX512-NEXT: # ymm8 = mem[0,1],ymm8[2],mem[3,4,5],ymm8[6],mem[7]
14887 ; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm9 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
14888 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
14889 ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm10 # 64-byte Folded Reload
14890 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
14891 ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload
14892 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
14893 ; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload
14894 ; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm3 # 64-byte Folded Reload
14895 ; AVX512-NEXT: vpternlogq $184, %zmm10, %zmm9, %zmm2
14896 ; AVX512-NEXT: vpternlogq $184, %zmm12, %zmm9, %zmm4
14897 ; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,3,1]
14898 ; AVX512-NEXT: vpshufb %ymm5, %ymm8, %ymm5
14899 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1,2],ymm5[3,4,5,6,7]
14900 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
14901 ; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
14902 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
14903 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload
14904 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload
14905 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload
14906 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload
14907 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
14908 ; AVX512-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm14 # 32-byte Folded Reload
14909 ; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload
14910 ; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm23
14911 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
14912 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5
14913 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
14914 ; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm12 # 64-byte Folded Reload
14915 ; AVX512-NEXT: movw $-512, %ax # imm = 0xFE00
14916 ; AVX512-NEXT: kmovw %eax, %k1
14917 ; AVX512-NEXT: vmovdqa32 %zmm6, %zmm12 {%k1}
14918 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
14919 ; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm6 # 64-byte Folded Reload
14920 ; AVX512-NEXT: vmovdqa32 %zmm7, %zmm6 {%k1}
14921 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7
14922 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
14923 ; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm6 # 64-byte Folded Reload
14924 ; AVX512-NEXT: vmovdqa32 %zmm8, %zmm6 {%k1}
14925 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8
14926 ; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm25 # 64-byte Folded Reload
14927 ; AVX512-NEXT: vmovdqa32 %zmm9, %zmm25 {%k1}
14928 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
14929 ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm6 # 64-byte Folded Reload
14930 ; AVX512-NEXT: vmovdqa32 %zmm10, %zmm6 {%k1}
14931 ; AVX512-NEXT: vmovdqa64 %zmm3, (%rsi)
14932 ; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rsi)
14933 ; AVX512-NEXT: vmovdqa64 %zmm4, 64(%rdx)
14934 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rdx)
14935 ; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rcx)
14936 ; AVX512-NEXT: vmovdqa64 %zmm12, (%rcx)
14937 ; AVX512-NEXT: vmovdqa64 %zmm25, 64(%r8)
14938 ; AVX512-NEXT: vmovdqa64 %zmm8, (%r8)
14939 ; AVX512-NEXT: vmovdqa64 %zmm6, (%r9)
14940 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14941 ; AVX512-NEXT: vmovdqa32 %zmm14, %zmm2 {%k1}
14942 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
14943 ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm1 # 64-byte Folded Reload
14944 ; AVX512-NEXT: vmovdqa32 %zmm15, %zmm1 {%k1}
14945 ; AVX512-NEXT: vmovdqa64 %zmm2, 64(%r9)
14946 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
14947 ; AVX512-NEXT: vmovdqa64 %zmm1, (%rax)
14948 ; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm11 # 64-byte Folded Reload
14949 ; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1
14950 ; AVX512-NEXT: vmovdqa32 %zmm23, %zmm1 {%k1}
14951 ; AVX512-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1}
14952 ; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rax)
14953 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
14954 ; AVX512-NEXT: vmovdqa64 %zmm11, (%rax)
14955 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm0
14956 ; AVX512-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1}
14957 ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax)
14958 ; AVX512-NEXT: addq $1864, %rsp # imm = 0x748
14959 ; AVX512-NEXT: vzeroupper
14960 ; AVX512-NEXT: retq
14961 ;
14962 ; AVX512-FCP-LABEL: load_i16_stride7_vf64:
14963 ; AVX512-FCP: # %bb.0:
14964 ; AVX512-FCP-NEXT: subq $1800, %rsp # imm = 0x708
14965 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5
14966 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4
14967 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,5,9,0,12,0,0,0]
14968 ; AVX512-FCP-NEXT: vpermd %zmm4, %zmm1, %zmm0
14969 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15]
14970 ; AVX512-FCP-NEXT: vpermd %zmm4, %zmm3, %zmm2
14971 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm26
14972 ; AVX512-FCP-NEXT: vpermd %zmm5, %zmm3, %zmm4
14973 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm22
14974 ; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %ymm5
14975 ; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm6
14976 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7]
14977 ; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm21
14978 ; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm23
14979 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
14980 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7]
14981 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u]
14982 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm6
14983 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
14984 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
14985 ; AVX512-FCP-NEXT: vporq %ymm4, %ymm6, %ymm17
14986 ; AVX512-FCP-NEXT: vmovdqa 672(%rdi), %xmm7
14987 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15]
14988 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm6
14989 ; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm20
14990 ; AVX512-FCP-NEXT: vpbroadcastw 700(%rdi), %xmm7
14991 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm28 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
14992 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm31
14993 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm31[0,1,0,2]
14994 ; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14995 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
14996 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6
14997 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
14998 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7]
14999 ; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm8
15000 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
15001 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7]
15002 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm3
15003 ; AVX512-FCP-NEXT: vporq %ymm2, %ymm3, %ymm16
15004 ; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %xmm14
15005 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm14, %xmm2
15006 ; AVX512-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm3
15007 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
15008 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15009 ; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm15
15010 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
15011 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
15012 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
15013 ; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm18
15014 ; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm19
15015 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
15016 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7]
15017 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
15018 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3
15019 ; AVX512-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0
15020 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15021 ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm2
15022 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm12
15023 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm2[2],ymm12[3,4,5],ymm2[6],ymm12[7]
15024 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6
15025 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7]
15026 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
15027 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6
15028 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm7
15029 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
15030 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm9
15031 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6],ymm9[7]
15032 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm14[1],xmm15[2,3,4,5,6,7]
15033 ; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm7
15034 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15]
15035 ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm10
15036 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9
15037 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15038 ; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm13
15039 ; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm15
15040 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7]
15041 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
15042 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7]
15043 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm8
15044 ; AVX512-FCP-NEXT: vpermd %zmm22, %zmm1, %zmm1
15045 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1
15046 ; AVX512-FCP-NEXT: vpor %ymm1, %ymm8, %ymm1
15047 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15048 ; AVX512-FCP-NEXT: vmovdqa 608(%rdi), %ymm4
15049 ; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm5
15050 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1],ymm4[2],ymm5[3,4,5],ymm4[6],ymm5[7]
15051 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
15052 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7]
15053 ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %ymm25
15054 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8
15055 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm25[0,1,0,2]
15056 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
15057 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm6
15058 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
15059 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5,6],ymm6[7]
15060 ; AVX512-FCP-NEXT: vmovdqa 688(%rdi), %xmm3
15061 ; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm1
15062 ; AVX512-FCP-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15063 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm1[1],xmm3[2,3,4,5,6,7]
15064 ; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8
15065 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm23 = [2,6,9,0,13,0,0,0]
15066 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm0
15067 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15068 ; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm11
15069 ; AVX512-FCP-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15070 ; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
15071 ; AVX512-FCP-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15072 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm11[1],ymm0[2,3],ymm11[4],ymm0[5,6,7]
15073 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8
15074 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7]
15075 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm29
15076 ; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15077 ; AVX512-FCP-NEXT: vpermd %zmm26, %zmm23, %zmm9
15078 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
15079 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm9
15080 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
15081 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8
15082 ; AVX512-FCP-NEXT: vpor %ymm9, %ymm8, %ymm8
15083 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15084 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7]
15085 ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm27
15086 ; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm26
15087 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
15088 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7]
15089 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
15090 ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm8
15091 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24
15092 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
15093 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm18 = [2,5,2,5,2,5,2,5]
15094 ; AVX512-FCP-NEXT: vpermd %ymm31, %ymm18, %ymm12
15095 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
15096 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2,3,4,5,6],ymm12[7]
15097 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3]
15098 ; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm20
15099 ; AVX512-FCP-NEXT: vmovdqa64 %xmm14, %xmm19
15100 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
15101 ; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm14
15102 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm2
15103 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15104 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm13[1],ymm15[2,3],ymm13[4],ymm15[5,6,7]
15105 ; AVX512-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15106 ; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm2
15107 ; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15108 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14
15109 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7]
15110 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm12, %ymm10
15111 ; AVX512-FCP-NEXT: vpermd %zmm22, %zmm23, %zmm7
15112 ; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15113 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm6
15114 ; AVX512-FCP-NEXT: vpor %ymm6, %ymm10, %ymm6
15115 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15116 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
15117 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm23
15118 ; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm4
15119 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
15120 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3,4,5],xmm6[6],xmm7[7]
15121 ; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm5
15122 ; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm6
15123 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
15124 ; AVX512-FCP-NEXT: vpermd %ymm25, %ymm18, %ymm7
15125 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
15126 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
15127 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
15128 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm18
15129 ; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm9
15130 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm1
15131 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15132 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7]
15133 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm9
15134 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm6[1],xmm9[2],xmm6[3],xmm9[4,5,6,7]
15135 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [10,3,6,15,12,13,6,15]
15136 ; AVX512-FCP-NEXT: vpermd %zmm29, %zmm1, %zmm10
15137 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
15138 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm10
15139 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
15140 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm9
15141 ; AVX512-FCP-NEXT: vpor %ymm10, %ymm9, %ymm0
15142 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15143 ; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm3
15144 ; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm5
15145 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6,7]
15146 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
15147 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3,4,5],xmm10[6],xmm9[7]
15148 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
15149 ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9
15150 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm14
15151 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
15152 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm31[0,1,1,3]
15153 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm13
15154 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
15155 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11]
15156 ; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8
15157 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm13, %zmm0
15158 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15159 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7]
15160 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13
15161 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm8[1],xmm13[2],xmm8[3],xmm13[4,5,6,7]
15162 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm8
15163 ; AVX512-FCP-NEXT: vpermd %zmm22, %zmm1, %zmm12
15164 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm6
15165 ; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0
15166 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15167 ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm1
15168 ; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm2
15169 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7]
15170 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8
15171 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7]
15172 ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6
15173 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
15174 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm25[0,1,1,3]
15175 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm10
15176 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
15177 ; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm7
15178 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm0
15179 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15180 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7]
15181 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
15182 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4,5,6,7]
15183 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
15184 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3
15185 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
15186 ; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
15187 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm6
15188 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7]
15189 ; AVX512-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm6
15190 ; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm5
15191 ; AVX512-FCP-NEXT: vpsrlq $48, %xmm20, %xmm9
15192 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
15193 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm3, %zmm27
15194 ; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm3
15195 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm4
15196 ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm21
15197 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [3,6,10,13,3,6,10,13]
15198 ; AVX512-FCP-NEXT: vpermd %zmm21, %zmm24, %zmm6
15199 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
15200 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm6
15201 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
15202 ; AVX512-FCP-NEXT: movw $992, %ax # imm = 0x3E0
15203 ; AVX512-FCP-NEXT: kmovw %eax, %k1
15204 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm28, %zmm4, %zmm17 {%k1}
15205 ; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15206 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
15207 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
15208 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
15209 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
15210 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15211 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm1
15212 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
15213 ; AVX512-FCP-NEXT: vpbroadcastw 680(%rdi), %xmm1
15214 ; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm9
15215 ; AVX512-FCP-NEXT: vpsrlq $48, %xmm18, %xmm2
15216 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
15217 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm26
15218 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,3,3,0,3,7,7]
15219 ; AVX512-FCP-NEXT: vpermd %ymm31, %ymm2, %ymm0
15220 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
15221 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm6
15222 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
15223 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,5,9,12,2,5,9,12]
15224 ; AVX512-FCP-NEXT: vpermd %zmm4, %zmm1, %zmm8
15225 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31]
15226 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm8
15227 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7],ymm8[8,9,10,11,12],ymm6[13,14,15]
15228 ; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm15
15229 ; AVX512-FCP-NEXT: vpsrld $16, %xmm19, %xmm8
15230 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
15231 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm6, %zmm3
15232 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15233 ; AVX512-FCP-NEXT: vmovdqa 704(%rdi), %ymm3
15234 ; AVX512-FCP-NEXT: vmovdqa 736(%rdi), %ymm8
15235 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7]
15236 ; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm18
15237 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm20
15238 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8
15239 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4],xmm6[5],xmm8[6,7]
15240 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11]
15241 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm8
15242 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm30
15243 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,4,7,11,14]
15244 ; AVX512-FCP-NEXT: vpermd %zmm30, %zmm6, %zmm13
15245 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
15246 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm13
15247 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1,2],xmm8[3,4,5,6],xmm13[7]
15248 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7]
15249 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
15250 ; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm7
15251 ; AVX512-FCP-NEXT: vpermd %zmm4, %zmm24, %zmm13
15252 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm13, %ymm11
15253 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5,6],ymm7[7]
15254 ; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm16 {%k1} # 16-byte Folded Reload
15255 ; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15256 ; AVX512-FCP-NEXT: vpermd %ymm25, %ymm2, %ymm2
15257 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
15258 ; AVX512-FCP-NEXT: vpermd %zmm21, %zmm1, %zmm7
15259 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm0
15260 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15]
15261 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
15262 ; AVX512-FCP-NEXT: vpsrld $16, %xmm14, %xmm2
15263 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
15264 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0
15265 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15266 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0]
15267 ; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1]
15268 ; AVX512-FCP-NEXT: vpermd %ymm31, %ymm12, %ymm0
15269 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
15270 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
15271 ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm23
15272 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,6,9,13,2,6,9,13]
15273 ; AVX512-FCP-NEXT: vpermd %zmm4, %zmm17, %zmm7
15274 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
15275 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm7
15276 ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm19
15277 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15]
15278 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7]
15279 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm22 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
15280 ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm2
15281 ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5
15282 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
15283 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15284 ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0
15285 ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm7
15286 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
15287 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm7
15288 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm7[6,7]
15289 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15290 ; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm2
15291 ; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm4
15292 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
15293 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm11
15294 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3],xmm11[4],xmm13[5],xmm11[6,7]
15295 ; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm11, %xmm11
15296 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm16
15297 ; AVX512-FCP-NEXT: vpermd %zmm16, %zmm6, %zmm6
15298 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6
15299 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1,2],xmm11[3,4,5,6],xmm6[7]
15300 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
15301 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm10
15302 ; AVX512-FCP-NEXT: vpermd %zmm10, %zmm1, %zmm1
15303 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1
15304 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
15305 ; AVX512-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
15306 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
15307 ; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm15
15308 ; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm4
15309 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6
15310 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4],xmm6[5],xmm1[6],xmm6[7]
15311 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
15312 ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
15313 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,0,0,0,4,8,11,15]
15314 ; AVX512-FCP-NEXT: vpermd %zmm16, %zmm11, %zmm13
15315 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
15316 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm13
15317 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3,4,5,6],xmm13[7]
15318 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7]
15319 ; AVX512-FCP-NEXT: vpermd %zmm10, %zmm17, %zmm13
15320 ; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29]
15321 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm13
15322 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7]
15323 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15324 ; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm7
15325 ; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm8
15326 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7]
15327 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm13
15328 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3],xmm1[4],xmm13[5],xmm1[6],xmm13[7]
15329 ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
15330 ; AVX512-FCP-NEXT: vpermd %zmm30, %zmm11, %zmm6
15331 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm3
15332 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6],xmm3[7]
15333 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
15334 ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm17, %zmm3
15335 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
15336 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
15337 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15338 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm15[2],ymm4[3,4,5],ymm15[6],ymm4[7]
15339 ; AVX512-FCP-NEXT: vmovdqa %ymm15, %ymm13
15340 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
15341 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
15342 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
15343 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
15344 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,0,0,0,5,8,12,15]
15345 ; AVX512-FCP-NEXT: vpermd %zmm16, %zmm3, %zmm6
15346 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128]
15347 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm6
15348 ; AVX512-FCP-NEXT: vpor %ymm6, %ymm1, %ymm1
15349 ; AVX512-FCP-NEXT: vpermd %zmm10, %zmm24, %zmm6
15350 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm6
15351 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7]
15352 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15353 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7]
15354 ; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm10
15355 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6
15356 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5],xmm6[6],xmm1[7]
15357 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
15358 ; AVX512-FCP-NEXT: vpermd %zmm30, %zmm3, %zmm2
15359 ; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2
15360 ; AVX512-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1
15361 ; AVX512-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm0
15362 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
15363 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
15364 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15365 ; AVX512-FCP-NEXT: vpermd %ymm25, %ymm12, %ymm0
15366 ; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm1
15367 ; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
15368 ; AVX512-FCP-NEXT: vpermd %zmm21, %zmm17, %zmm1
15369 ; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm2
15370 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
15371 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
15372 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7]
15373 ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm2
15374 ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
15375 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
15376 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15377 ; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm1
15378 ; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm15
15379 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm1[2],ymm15[3,4,5],ymm1[6],ymm15[7]
15380 ; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm7
15381 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
15382 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
15383 ; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm11
15384 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7]
15385 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
15386 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
15387 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9]
15388 ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
15389 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15390 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,11,2,11,12,5,8,9]
15391 ; AVX512-FCP-NEXT: vpermd %zmm16, %zmm3, %zmm6
15392 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
15393 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm6
15394 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15]
15395 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128]
15396 ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1
15397 ; AVX512-FCP-NEXT: vpor %ymm6, %ymm1, %ymm1
15398 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15399 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15400 ; AVX512-FCP-NEXT: vmovdqa 864(%rdi), %ymm1
15401 ; AVX512-FCP-NEXT: vmovdqa 832(%rdi), %ymm5
15402 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm1[2],ymm5[3,4,5],ymm1[6],ymm5[7]
15403 ; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm21
15404 ; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm19
15405 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
15406 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
15407 ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
15408 ; AVX512-FCP-NEXT: vpermd %zmm30, %zmm3, %zmm1
15409 ; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1
15410 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7]
15411 ; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm23
15412 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
15413 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7]
15414 ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2
15415 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15416 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
15417 ; AVX512-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1
15418 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15419 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15420 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
15421 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
15422 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7]
15423 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
15424 ; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm2
15425 ; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm25
15426 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
15427 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
15428 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
15429 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,3,7,10,14,0,0,0]
15430 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
15431 ; AVX512-FCP-NEXT: vpermd %zmm24, %zmm5, %zmm3
15432 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
15433 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3
15434 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
15435 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
15436 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm29, %zmm27
15437 ; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15438 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm7[3],ymm15[4,5],ymm7[6],ymm15[7]
15439 ; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm12
15440 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
15441 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7]
15442 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7]
15443 ; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm22
15444 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm17
15445 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6
15446 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3,4,5],xmm6[6],xmm3[7]
15447 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
15448 ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
15449 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15450 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [2,0,0,0,6,9,13,0]
15451 ; AVX512-FCP-NEXT: vpermd %zmm16, %zmm20, %zmm13
15452 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128]
15453 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm13
15454 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2],ymm1[3,4,5,6,7],ymm13[8,9,10],ymm1[11,12,13,14,15]
15455 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,2,3,0,1,14,15,12,13,128,128]
15456 ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3
15457 ; AVX512-FCP-NEXT: vpor %ymm3, %ymm13, %ymm3
15458 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
15459 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15460 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
15461 ; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
15462 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7]
15463 ; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm13
15464 ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm3
15465 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
15466 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
15467 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
15468 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
15469 ; AVX512-FCP-NEXT: vpermd %zmm25, %zmm5, %zmm3
15470 ; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3
15471 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
15472 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm1, %zmm29, %zmm26
15473 ; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15474 ; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm5
15475 ; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm4
15476 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
15477 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
15478 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7]
15479 ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
15480 ; AVX512-FCP-NEXT: vpermd %zmm30, %zmm20, %zmm3
15481 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0
15482 ; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm14
15483 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5,6,7]
15484 ; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm20
15485 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6
15486 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3,4,5],xmm6[6],xmm3[7]
15487 ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm2
15488 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15489 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
15490 ; AVX512-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0
15491 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15492 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15493 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
15494 ; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm23
15495 ; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm26
15496 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
15497 ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1
15498 ; AVX512-FCP-NEXT: vmovdqa64 %xmm13, %xmm27
15499 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
15500 ; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
15501 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm28
15502 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
15503 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [0,4,7,11,14,0,0,0]
15504 ; AVX512-FCP-NEXT: vpermd %zmm24, %zmm21, %zmm2
15505 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
15506 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2
15507 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
15508 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15509 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7]
15510 ; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm19
15511 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6
15512 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7]
15513 ; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm9
15514 ; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm2
15515 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm2[1],ymm9[2,3,4],ymm2[5],ymm9[6,7]
15516 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm13
15517 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0],xmm6[1],xmm13[2],xmm6[3],xmm13[4,5,6,7]
15518 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
15519 ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0
15520 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15521 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [3,0,0,0,6,10,13,0]
15522 ; AVX512-FCP-NEXT: vpermd %zmm16, %zmm18, %zmm1
15523 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128]
15524 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
15525 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
15526 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,6,7,4,5,2,3,0,1,14,15,128,128]
15527 ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6
15528 ; AVX512-FCP-NEXT: vpor %ymm1, %ymm6, %ymm1
15529 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15530 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm31
15531 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7]
15532 ; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm22
15533 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
15534 ; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm6
15535 ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
15536 ; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm6
15537 ; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
15538 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
15539 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 32-byte Folded Reload
15540 ; AVX512-FCP-NEXT: vpermd %zmm25, %zmm21, %zmm1
15541 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
15542 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7]
15543 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6,7]
15544 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm28
15545 ; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm27
15546 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm8
15547 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3,4,5],xmm8[6],xmm0[7]
15548 ; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm21 # 32-byte Folded Reload
15549 ; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0
15550 ; AVX512-FCP-NEXT: vpermd %zmm30, %zmm18, %zmm8
15551 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm3
15552 ; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm6
15553 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0],ymm14[1],ymm6[2,3,4],ymm14[5],ymm6[6,7]
15554 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm12
15555 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0],xmm8[1],xmm12[2],xmm8[3],xmm12[4,5,6,7]
15556 ; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm4
15557 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15558 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15]
15559 ; AVX512-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3
15560 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
15561 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm18
15562 ; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
15563 ; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm3
15564 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7]
15565 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
15566 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
15567 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3
15568 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
15569 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
15570 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,8,11,15,0,0,0]
15571 ; AVX512-FCP-NEXT: vpermd %zmm24, %zmm3, %zmm8
15572 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
15573 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8
15574 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3,4,5,6,7]
15575 ; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm5
15576 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7]
15577 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7
15578 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7]
15579 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm2[2],ymm9[3,4],ymm2[5],ymm9[6,7]
15580 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15]
15581 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm8
15582 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
15583 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,1,3,4,5,6,7]
15584 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
15585 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,1,10,3,14,7,10,3]
15586 ; AVX512-FCP-NEXT: vpermd %zmm16, %zmm9, %zmm15
15587 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
15588 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm15
15589 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0],ymm7[1,2],ymm15[3,4,5,6,7]
15590 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15]
15591 ; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm5
15592 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
15593 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7],ymm7[8,9,10],ymm5[11,12,13,14,15]
15594 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm5[4,5,6,7]
15595 ; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm2
15596 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm2[3],ymm11[4,5],ymm2[6],ymm11[7]
15597 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm13
15598 ; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm13, %xmm4
15599 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7]
15600 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
15601 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload
15602 ; AVX512-FCP-NEXT: vpermd %zmm25, %zmm3, %zmm3
15603 ; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3
15604 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm3[2,3,4,5,6,7]
15605 ; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm2
15606 ; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm3
15607 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7]
15608 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm10
15609 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm3[1],xmm10[2],xmm3[3],xmm10[4,5,6,7]
15610 ; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm3
15611 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1],ymm14[2],ymm6[3,4],ymm14[5],ymm6[6,7]
15612 ; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm11
15613 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10
15614 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,1,3,4,5,6,7]
15615 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
15616 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload
15617 ; AVX512-FCP-NEXT: vpermd %zmm30, %zmm9, %zmm9
15618 ; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm8
15619 ; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm9 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
15620 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
15621 ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload
15622 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
15623 ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm13 # 64-byte Folded Reload
15624 ; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
15625 ; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm17 # 64-byte Folded Reload
15626 ; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm21 # 64-byte Folded Reload
15627 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm12, %zmm9, %zmm5
15628 ; AVX512-FCP-NEXT: vpternlogq $184, %zmm13, %zmm9, %zmm11
15629 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm10[1,2],ymm8[3,4,5,6,7]
15630 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
15631 ; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3,4,5,6,7],ymm8[8,9,10],ymm3[11,12,13,14,15]
15632 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7]
15633 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
15634 ; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm19 # 64-byte Folded Reload
15635 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload
15636 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload
15637 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
15638 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 32-byte Folded Reload
15639 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload
15640 ; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload
15641 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm16
15642 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm18
15643 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7
15644 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
15645 ; AVX512-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
15646 ; AVX512-FCP-NEXT: kmovw %eax, %k1
15647 ; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm19 {%k1}
15648 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15649 ; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm2 # 64-byte Folded Reload
15650 ; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm2 {%k1}
15651 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15652 ; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm8 # 64-byte Folded Reload
15653 ; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm8 {%k1}
15654 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
15655 ; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm6 # 64-byte Folded Reload
15656 ; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm6 {%k1}
15657 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
15658 ; AVX512-FCP-NEXT: vmovdqa32 %zmm13, %zmm9 {%k1}
15659 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
15660 ; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1}
15661 ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm1 # 64-byte Folded Reload
15662 ; AVX512-FCP-NEXT: vmovdqa32 %zmm18, %zmm1 {%k1}
15663 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rsi)
15664 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%rsi)
15665 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx)
15666 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
15667 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rcx)
15668 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
15669 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8)
15670 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r8)
15671 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 64(%r9)
15672 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%r9)
15673 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
15674 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax)
15675 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
15676 ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm1 # 64-byte Folded Reload
15677 ; AVX512-FCP-NEXT: vmovdqa32 %zmm16, %zmm1 {%k1}
15678 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
15679 ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload
15680 ; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1}
15681 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
15682 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax)
15683 ; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload
15684 ; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1}
15685 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
15686 ; AVX512-FCP-NEXT: addq $1800, %rsp # imm = 0x708
15687 ; AVX512-FCP-NEXT: vzeroupper
15688 ; AVX512-FCP-NEXT: retq
15689 ;
15690 ; AVX512DQ-LABEL: load_i16_stride7_vf64:
15691 ; AVX512DQ: # %bb.0:
15692 ; AVX512DQ-NEXT: subq $1592, %rsp # imm = 0x638
15693 ; AVX512DQ-NEXT: vmovdqa 480(%rdi), %ymm1
15694 ; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm2
15695 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7]
15696 ; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm30
15697 ; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm29
15698 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
15699 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
15700 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u]
15701 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2
15702 ; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm3
15703 ; AVX512DQ-NEXT: vmovdqa 544(%rdi), %ymm4
15704 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
15705 ; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm17
15706 ; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm12
15707 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3]
15708 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
15709 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm3
15710 ; AVX512DQ-NEXT: vporq %ymm3, %ymm2, %ymm26
15711 ; AVX512DQ-NEXT: vpbroadcastw 700(%rdi), %xmm3
15712 ; AVX512DQ-NEXT: vmovdqa 672(%rdi), %xmm14
15713 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,0,3]
15714 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
15715 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
15716 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15717 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm4
15718 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5
15719 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm6
15720 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm7
15721 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7]
15722 ; AVX512DQ-NEXT: vmovdqa %ymm6, %ymm8
15723 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
15724 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1
15725 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
15726 ; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm11
15727 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm6
15728 ; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4
15729 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7]
15730 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
15731 ; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
15732 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15733 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm9
15734 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm13
15735 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm18
15736 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm18[0,1,0,2]
15737 ; AVX512DQ-NEXT: vpbroadcastw 252(%rdi), %xmm0
15738 ; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm15
15739 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,1,0,3]
15740 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
15741 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
15742 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15743 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm0
15744 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7]
15745 ; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm4
15746 ; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm16
15747 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7,8,9,10],ymm0[11],ymm3[12,13,14,15]
15748 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
15749 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
15750 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
15751 ; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm21
15752 ; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm19
15753 ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6
15754 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7]
15755 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
15756 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm5
15757 ; AVX512DQ-NEXT: vpor %ymm0, %ymm5, %ymm0
15758 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15759 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm13[2],ymm9[3,4,5],ymm13[6],ymm9[7]
15760 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm5
15761 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7]
15762 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
15763 ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm0, %xmm0
15764 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15765 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm28[0,1,1,3,4,5,5,7]
15766 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
15767 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5,6],ymm7[7]
15768 ; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm10
15769 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm15[1],xmm10[2,3,4,5,6,7]
15770 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
15771 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,0,3,3,4,5,6,7]
15772 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm0
15773 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15774 ; AVX512DQ-NEXT: vmovdqa 528(%rdi), %xmm7
15775 ; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm11
15776 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7]
15777 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6,7,8,9,10],ymm7[11],ymm8[12,13,14,15]
15778 ; AVX512DQ-NEXT: vpshufb %ymm3, %ymm7, %ymm3
15779 ; AVX512DQ-NEXT: vmovdqa64 %ymm30, %ymm0
15780 ; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm1
15781 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
15782 ; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8
15783 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7]
15784 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm7, %ymm6
15785 ; AVX512DQ-NEXT: vpor %ymm3, %ymm6, %ymm3
15786 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15787 ; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm2
15788 ; AVX512DQ-NEXT: vmovdqa 608(%rdi), %ymm1
15789 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7]
15790 ; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm23
15791 ; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm6
15792 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7]
15793 ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm3, %xmm5
15794 ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %ymm22
15795 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm20 = ymm22[0,1,0,2]
15796 ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
15797 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[0,1,1,3,4,5,5,7]
15798 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
15799 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
15800 ; AVX512DQ-NEXT: vmovdqa 688(%rdi), %xmm0
15801 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm14[1],xmm0[2,3,4,5,6,7]
15802 ; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm17
15803 ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm25
15804 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
15805 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7]
15806 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0
15807 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15808 ; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm1
15809 ; AVX512DQ-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15810 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm4[2,3],ymm1[4,5],ymm4[6,7]
15811 ; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm14
15812 ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15813 ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6
15814 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15]
15815 ; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm4
15816 ; AVX512DQ-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15817 ; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm0
15818 ; AVX512DQ-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15819 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6,7]
15820 ; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7
15821 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7]
15822 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
15823 ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm5, %ymm8
15824 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
15825 ; AVX512DQ-NEXT: vpshufb %ymm5, %ymm6, %ymm6
15826 ; AVX512DQ-NEXT: vpor %ymm6, %ymm8, %ymm3
15827 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15828 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm13[3],ymm9[4,5],ymm13[6],ymm9[7]
15829 ; AVX512DQ-NEXT: vmovdqa64 %ymm9, %ymm19
15830 ; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm16
15831 ; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm8
15832 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7]
15833 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
15834 ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm8, %xmm8
15835 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
15836 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm18[0,1,1,2]
15837 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,0,3,4,5,4,7]
15838 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
15839 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
15840 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3]
15841 ; AVX512DQ-NEXT: vmovdqa64 %xmm10, %xmm21
15842 ; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm24
15843 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15844 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,2,3]
15845 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7]
15846 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm3
15847 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15848 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7]
15849 ; AVX512DQ-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15850 ; AVX512DQ-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15851 ; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9
15852 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7,8,9,10,11],ymm9[12],ymm8[13,14,15]
15853 ; AVX512DQ-NEXT: vpshufb %ymm7, %ymm8, %ymm7
15854 ; AVX512DQ-NEXT: vmovdqa64 %ymm30, %ymm13
15855 ; AVX512DQ-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15856 ; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm10
15857 ; AVX512DQ-NEXT: vmovdqu64 %ymm29, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15858 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5,6,7]
15859 ; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9
15860 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
15861 ; AVX512DQ-NEXT: vpshufb %ymm5, %ymm8, %ymm5
15862 ; AVX512DQ-NEXT: vpor %ymm7, %ymm5, %ymm3
15863 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15864 ; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm15
15865 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm15[3],ymm2[4,5],ymm15[6],ymm2[7]
15866 ; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm9
15867 ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm7
15868 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7]
15869 ; AVX512DQ-NEXT: vpshufb %xmm6, %xmm5, %xmm5
15870 ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
15871 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm22[0,1,1,2]
15872 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7]
15873 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
15874 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
15875 ; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm6
15876 ; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm2
15877 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
15878 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15879 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,1,2,3]
15880 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
15881 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm3
15882 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15883 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
15884 ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6
15885 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
15886 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2],ymm14[3],ymm1[4,5],ymm14[6],ymm1[7]
15887 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm5[1,3,2,3]
15888 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
15889 ; AVX512DQ-NEXT: vpshufb %ymm5, %ymm6, %ymm8
15890 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
15891 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm7, %ymm7
15892 ; AVX512DQ-NEXT: vpor %ymm7, %ymm8, %ymm0
15893 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15894 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm13[1],ymm10[2,3,4],ymm13[5],ymm10[6,7]
15895 ; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8
15896 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5,6,7]
15897 ; AVX512DQ-NEXT: vpshufb %ymm6, %ymm7, %ymm6
15898 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7]
15899 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3]
15900 ; AVX512DQ-NEXT: vpshufb %ymm5, %ymm7, %ymm5
15901 ; AVX512DQ-NEXT: vpor %ymm5, %ymm6, %ymm0
15902 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15903 ; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm4
15904 ; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm0
15905 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7]
15906 ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6
15907 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
15908 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm16 = ymm18[0,1,1,3]
15909 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
15910 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm5
15911 ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
15912 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[0,1,2,1,4,5,6,5]
15913 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
15914 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
15915 ; AVX512DQ-NEXT: vpbroadcastw 232(%rdi), %xmm6
15916 ; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm8
15917 ; AVX512DQ-NEXT: vpsrlq $48, %xmm21, %xmm7
15918 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
15919 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm3
15920 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15921 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm9[1],ymm15[2,3,4],ymm9[5],ymm15[6,7]
15922 ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6
15923 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
15924 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm1
15925 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm22[0,1,1,3]
15926 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15927 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[0,1,2,1,4,5,6,5]
15928 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
15929 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
15930 ; AVX512DQ-NEXT: vpbroadcastw 680(%rdi), %xmm3
15931 ; AVX512DQ-NEXT: vpsrlq $48, %xmm25, %xmm5
15932 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
15933 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm1
15934 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15935 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7]
15936 ; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm12
15937 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3
15938 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4],xmm3[5,6,7]
15939 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
15940 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm3
15941 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
15942 ; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm5
15943 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm6
15944 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5],xmm5[6],xmm6[7]
15945 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
15946 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6]
15947 ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
15948 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15]
15949 ; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm0
15950 ; AVX512DQ-NEXT: vpsrld $16, %xmm24, %xmm7
15951 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
15952 ; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm13
15953 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3
15954 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15955 ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm11
15956 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm9[2],ymm15[3,4],ymm9[5],ymm15[6,7]
15957 ; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm7
15958 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3],xmm3[4],xmm7[5,6,7]
15959 ; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1
15960 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7
15961 ; AVX512DQ-NEXT: vmovdqa 656(%rdi), %xmm1
15962 ; AVX512DQ-NEXT: vmovdqa 640(%rdi), %xmm3
15963 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1,2,3,4,5],xmm1[6],xmm3[7]
15964 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,3]
15965 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6]
15966 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
15967 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
15968 ; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm9
15969 ; AVX512DQ-NEXT: vpsrld $16, %xmm17, %xmm8
15970 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
15971 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm7
15972 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15973 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm12[2,3],ymm4[4,5],ymm12[6,7]
15974 ; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8
15975 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3],xmm8[4],xmm7[5],xmm8[6,7]
15976 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6,7]
15977 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
15978 ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm7, %xmm7
15979 ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
15980 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
15981 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,4,7]
15982 ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
15983 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7],ymm7[8,9,10,11,12],ymm6[13,14,15]
15984 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
15985 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
15986 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
15987 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0
15988 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15989 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7]
15990 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm6
15991 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
15992 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
15993 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7]
15994 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
15995 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15996 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[0,1,2,1,4,5,6,5]
15997 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
15998 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7]
15999 ; AVX512DQ-NEXT: movw $992, %ax # imm = 0x3E0
16000 ; AVX512DQ-NEXT: kmovw %eax, %k1
16001 ; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 {%k1} # 16-byte Folded Reload
16002 ; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16003 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
16004 ; AVX512DQ-NEXT: vmovdqa64 %ymm15, %ymm18
16005 ; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm25
16006 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm6
16007 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3],xmm6[4],xmm0[5],xmm6[6,7]
16008 ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm0, %xmm0
16009 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5,6,7]
16010 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
16011 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
16012 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
16013 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
16014 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
16015 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
16016 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
16017 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
16018 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
16019 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16020 ; AVX512DQ-NEXT: vmovdqa 704(%rdi), %ymm1
16021 ; AVX512DQ-NEXT: vmovdqa 736(%rdi), %ymm2
16022 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
16023 ; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm6
16024 ; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm7
16025 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
16026 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7]
16027 ; AVX512DQ-NEXT: vmovdqa 800(%rdi), %ymm3
16028 ; AVX512DQ-NEXT: vmovdqa 768(%rdi), %ymm2
16029 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7]
16030 ; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm24
16031 ; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm5
16032 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
16033 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15]
16034 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
16035 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
16036 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19]
16037 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm2
16038 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
16039 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
16040 ; AVX512DQ-NEXT: vmovdqa 832(%rdi), %ymm3
16041 ; AVX512DQ-NEXT: vmovdqa 864(%rdi), %ymm8
16042 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7]
16043 ; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm9
16044 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
16045 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
16046 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
16047 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
16048 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
16049 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
16050 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
16051 ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm22
16052 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm12[3],ymm4[4,5],ymm12[6],ymm4[7]
16053 ; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm20
16054 ; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm21
16055 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2
16056 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
16057 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
16058 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
16059 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
16060 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
16061 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm28[0,1,2,1,4,5,6,5]
16062 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
16063 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
16064 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16065 ; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload
16066 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16067 ; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm2
16068 ; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm3
16069 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
16070 ; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm11
16071 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2
16072 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7]
16073 ; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm14
16074 ; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm0
16075 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm14[3],ymm0[4,5],ymm14[6],ymm0[7]
16076 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
16077 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6],ymm10[7,8,9,10,11,12,13],ymm12[14],ymm10[15]
16078 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm10, %ymm1
16079 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,1]
16080 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
16081 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3,4,5,6],xmm1[7]
16082 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm1[4,5,6,7]
16083 ; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm1
16084 ; AVX512DQ-NEXT: vmovdqa 416(%rdi), %ymm2
16085 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
16086 ; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm13
16087 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,3,1,4,5,6,7]
16088 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
16089 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,2,4,5,6,7]
16090 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
16091 ; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
16092 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm12[6,7]
16093 ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16094 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm11[3],ymm3[4,5],ymm11[6],ymm3[7]
16095 ; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm19
16096 ; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm15
16097 ; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm12
16098 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4],xmm12[5],xmm10[6],xmm12[7]
16099 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm0[1],ymm14[2,3],ymm0[4],ymm14[5,6,7]
16100 ; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm3
16101 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
16102 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6],ymm13[7,8],ymm12[9,10,11,12,13,14],ymm13[15]
16103 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm13 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
16104 ; AVX512DQ-NEXT: vpshufb %xmm13, %xmm10, %xmm10
16105 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
16106 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm12, %ymm12
16107 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5,6],xmm12[7]
16108 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
16109 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
16110 ; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm23
16111 ; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm4
16112 ; AVX512DQ-NEXT: vextracti32x4 $1, %ymm12, %xmm28
16113 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm28[2,1,2,3]
16114 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7]
16115 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
16116 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7]
16117 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
16118 ; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
16119 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm11[6,7]
16120 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16121 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7]
16122 ; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11
16123 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4],xmm11[5],xmm10[6],xmm11[7]
16124 ; AVX512DQ-NEXT: vpshufb %xmm13, %xmm10, %xmm10
16125 ; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm1
16126 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6,7]
16127 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
16128 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6],ymm12[7,8],ymm11[9,10,11,12,13,14],ymm12[15]
16129 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm11, %ymm0
16130 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3,4,5,6],xmm0[7]
16131 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
16132 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
16133 ; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11
16134 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
16135 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7]
16136 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
16137 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
16138 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
16139 ; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
16140 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
16141 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16142 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7]
16143 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23]
16144 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm10
16145 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
16146 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3,4,5,6,7,8],ymm0[9],ymm10[10,11,12,13,14,15]
16147 ; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm1
16148 ; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm13
16149 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1],ymm15[2],ymm13[3,4,5],ymm15[6],ymm13[7]
16150 ; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm12
16151 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5],xmm12[6],xmm10[7]
16152 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
16153 ; AVX512DQ-NEXT: vpshufb %ymm12, %ymm10, %ymm10
16154 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
16155 ; AVX512DQ-NEXT: vpternlogq $242, %ymm0, %ymm19, %ymm10
16156 ; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm2
16157 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
16158 ; AVX512DQ-NEXT: vextracti32x4 $1, %ymm0, %xmm28
16159 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
16160 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm28[0,1,2,1]
16161 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,7]
16162 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
16163 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
16164 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7]
16165 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
16166 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
16167 ; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm10 # 64-byte Folded Reload
16168 ; AVX512DQ-NEXT: movw $-512, %ax # imm = 0xFE00
16169 ; AVX512DQ-NEXT: kmovw %eax, %k1
16170 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1}
16171 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16172 ; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm10
16173 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm5[1],ymm10[2,3,4],ymm5[5],ymm10[6,7]
16174 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm11
16175 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
16176 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4,5,6,7,8],ymm0[9],ymm11[10,11,12,13,14,15]
16177 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7]
16178 ; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm15
16179 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm15[4],xmm11[5],xmm15[6],xmm11[7]
16180 ; AVX512DQ-NEXT: vpshufb %ymm12, %ymm11, %ymm11
16181 ; AVX512DQ-NEXT: vpternlogq $242, %ymm0, %ymm19, %ymm11
16182 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7]
16183 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm12
16184 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
16185 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1]
16186 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,7]
16187 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
16188 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
16189 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
16190 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
16191 ; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm11 # 64-byte Folded Reload
16192 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1}
16193 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16194 ; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm0
16195 ; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm11
16196 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7]
16197 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm11
16198 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7]
16199 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
16200 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
16201 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
16202 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm16[0,1,2,0,4,5,6,4]
16203 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
16204 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7]
16205 ; AVX512DQ-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
16206 ; AVX512DQ-NEXT: # xmm11 = mem[0,1,2,3,6,5,6,7]
16207 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,2,2]
16208 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm27
16209 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm1[3],ymm13[4,5],ymm1[6],ymm13[7]
16210 ; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm19
16211 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm11
16212 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
16213 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7]
16214 ; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm26
16215 ; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm12
16216 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
16217 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128]
16218 ; AVX512DQ-NEXT: vpshufb %xmm12, %xmm0, %xmm0
16219 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm3[2],ymm14[3,4],ymm3[5],ymm14[6,7]
16220 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,1,2,0]
16221 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
16222 ; AVX512DQ-NEXT: vpshufb %ymm15, %ymm13, %ymm13
16223 ; AVX512DQ-NEXT: vpor %ymm0, %ymm13, %ymm0
16224 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,0,0,0,4,5,6,7]
16225 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4]
16226 ; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
16227 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15]
16228 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
16229 ; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm27 # 64-byte Folded Reload
16230 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm27 {%k1}
16231 ; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm0
16232 ; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm2
16233 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7]
16234 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm11
16235 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7]
16236 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
16237 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
16238 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
16239 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm29[0,1,2,0,4,5,6,4]
16240 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
16241 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7]
16242 ; AVX512DQ-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
16243 ; AVX512DQ-NEXT: # xmm11 = mem[0,1,2,3,6,5,6,7]
16244 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,2,2]
16245 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm29
16246 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
16247 ; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm24
16248 ; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm25
16249 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm11
16250 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
16251 ; AVX512DQ-NEXT: vpshufb %xmm12, %xmm0, %xmm0
16252 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7]
16253 ; AVX512DQ-NEXT: vmovdqa64 %ymm10, %ymm23
16254 ; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm21
16255 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,1,2,0]
16256 ; AVX512DQ-NEXT: vpshufb %ymm15, %ymm11, %ymm11
16257 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7]
16258 ; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm6
16259 ; AVX512DQ-NEXT: vmovdqa64 %ymm9, %ymm20
16260 ; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm13
16261 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7]
16262 ; AVX512DQ-NEXT: vpor %ymm0, %ymm11, %ymm0
16263 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7]
16264 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,5,4]
16265 ; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
16266 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
16267 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
16268 ; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm29 # 64-byte Folded Reload
16269 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm29 {%k1}
16270 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16271 ; AVX512DQ-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16272 ; AVX512DQ-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4,5],mem[6],ymm0[7]
16273 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
16274 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm12
16275 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
16276 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6],ymm12[7,8,9,10,11,12,13],ymm0[14],ymm12[15]
16277 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
16278 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
16279 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7]
16280 ; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm31
16281 ; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm30
16282 ; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm13
16283 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7]
16284 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
16285 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7]
16286 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
16287 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm0[2,3,4,5,6,7]
16288 ; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm8
16289 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7]
16290 ; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm18
16291 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm12
16292 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3,4,5],xmm0[6],xmm12[7]
16293 ; AVX512DQ-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16294 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm3[2,3],ymm14[4,5],ymm3[6,7]
16295 ; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm13
16296 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16297 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm14[0,1,0,1]
16298 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7,8,9,10],ymm15[11],ymm12[12,13,14,15]
16299 ; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm5
16300 ; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm4
16301 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6,7]
16302 ; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm10
16303 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1],xmm15[2,3,4,5],xmm10[6],xmm15[7]
16304 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
16305 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0
16306 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
16307 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128]
16308 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm12, %ymm12
16309 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15]
16310 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,2,3,0,1,14,15,12,13,128,128]
16311 ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm10, %xmm10
16312 ; AVX512DQ-NEXT: vpor %ymm12, %ymm10, %ymm10
16313 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm28
16314 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
16315 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm14
16316 ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm7 # 64-byte Folded Reload
16317 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm7 {%k1}
16318 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16319 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload
16320 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload
16321 ; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm0
16322 ; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm7
16323 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3,4,5],ymm0[6],ymm7[7]
16324 ; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm10
16325 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
16326 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6],ymm10[7,8,9,10,11,12,13],ymm0[14],ymm10[15]
16327 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload
16328 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
16329 ; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm7
16330 ; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm9
16331 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7]
16332 ; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11
16333 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
16334 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
16335 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7]
16336 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
16337 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5,6,7]
16338 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
16339 ; AVX512DQ-NEXT: vpternlogq $184, %zmm0, %zmm14, %zmm12
16340 ; AVX512DQ-NEXT: vmovdqa %ymm6, %ymm9
16341 ; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm7
16342 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
16343 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm10
16344 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3,4,5],xmm0[6],xmm10[7]
16345 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0
16346 ; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm14
16347 ; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm3
16348 ; AVX512DQ-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16349 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1],ymm3[2,3],ymm14[4,5],ymm3[6,7]
16350 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm23[0,1,0,1]
16351 ; AVX512DQ-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16352 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7,8,9,10],ymm11[11],ymm10[12,13,14,15]
16353 ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm10, %ymm1
16354 ; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm6
16355 ; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm15
16356 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0],ymm15[1],ymm6[2,3],ymm15[4],ymm6[5,6,7]
16357 ; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11
16358 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7]
16359 ; AVX512DQ-NEXT: vpshufb %xmm2, %xmm10, %xmm2
16360 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
16361 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
16362 ; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1
16363 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16364 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm12 {%k1}
16365 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16366 ; AVX512DQ-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload
16367 ; AVX512DQ-NEXT: # ymm0 = mem[0,1,2],ymm13[3],mem[4,5],ymm13[6],mem[7]
16368 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
16369 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
16370 ; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm13
16371 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5,6,7]
16372 ; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm20
16373 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
16374 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7]
16375 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7]
16376 ; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm21
16377 ; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm25
16378 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm10
16379 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0],xmm2[1],xmm10[2],xmm2[3],xmm10[4,5,6,7]
16380 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128]
16381 ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm0, %ymm0
16382 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
16383 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
16384 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
16385 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
16386 ; AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm0
16387 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16388 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
16389 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
16390 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm11[3],ymm4[4,5],ymm11[6],ymm4[7]
16391 ; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm26
16392 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
16393 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6],ymm2[7,8],ymm1[9,10,11,12,13,14],ymm2[15]
16394 ; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm5
16395 ; AVX512DQ-NEXT: vmovdqa64 %ymm30, %ymm11
16396 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm11[2,3],ymm5[4,5],ymm11[6,7]
16397 ; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm18
16398 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm12
16399 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
16400 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7]
16401 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
16402 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
16403 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
16404 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
16405 ; AVX512DQ-NEXT: vpshufb %ymm8, %ymm1, %ymm1
16406 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2,3,4,5,6,7]
16407 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 32-byte Folded Reload
16408 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
16409 ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm12 # 64-byte Folded Reload
16410 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm12 {%k1}
16411 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm3[3],ymm14[4,5],ymm3[6],ymm14[7]
16412 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
16413 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
16414 ; AVX512DQ-NEXT: vpshufb %ymm10, %ymm0, %ymm0
16415 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5,6,7]
16416 ; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm30
16417 ; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm14
16418 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
16419 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7]
16420 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
16421 ; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm9
16422 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm15[1],ymm6[2,3,4],ymm15[5],ymm6[6,7]
16423 ; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm31
16424 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm10
16425 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0],xmm2[1],xmm10[2],xmm2[3],xmm10[4,5,6,7]
16426 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
16427 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
16428 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
16429 ; AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm0
16430 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16431 ; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm6
16432 ; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm5
16433 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
16434 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
16435 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6],ymm2[7,8],ymm1[9,10,11,12,13,14],ymm2[15]
16436 ; AVX512DQ-NEXT: vpshufb %ymm8, %ymm1, %ymm1
16437 ; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm10
16438 ; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm15
16439 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
16440 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
16441 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
16442 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
16443 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
16444 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
16445 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
16446 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
16447 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
16448 ; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm24, %zmm7
16449 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm7 {%k1}
16450 ; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm0
16451 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6,7]
16452 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
16453 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
16454 ; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm1
16455 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7]
16456 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
16457 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
16458 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
16459 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7]
16460 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
16461 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
16462 ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0
16463 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3,4,5,6,7]
16464 ; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm0
16465 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[2,3,4],ymm0[5],ymm13[6,7]
16466 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
16467 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
16468 ; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm0
16469 ; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm3
16470 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
16471 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3
16472 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
16473 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
16474 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
16475 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
16476 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16477 ; AVX512DQ-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16478 ; AVX512DQ-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4,5],ymm0[6],mem[7]
16479 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,3,1]
16480 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
16481 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm4
16482 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3,4,5,6,7]
16483 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15]
16484 ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm1, %xmm1
16485 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
16486 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15]
16487 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
16488 ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm11 # 64-byte Folded Reload
16489 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm11 {%k1}
16490 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
16491 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
16492 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6,7,8],ymm3[9],ymm1[10,11,12,13,14,15]
16493 ; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1
16494 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload
16495 ; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
16496 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7]
16497 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,4,6,7]
16498 ; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm5
16499 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
16500 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
16501 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
16502 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7]
16503 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16504 ; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm24, %zmm6
16505 ; AVX512DQ-NEXT: vmovdqa64 %ymm30, %ymm1
16506 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7]
16507 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm5
16508 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5,6,7]
16509 ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm1, %xmm1
16510 ; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm4
16511 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7]
16512 ; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5
16513 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7]
16514 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
16515 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,3,4,5,6,7]
16516 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
16517 ; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
16518 ; AVX512DQ-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
16519 ; AVX512DQ-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4,5],mem[6],ymm5[7]
16520 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,3,1]
16521 ; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0
16522 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1,2],ymm0[3,4,5,6,7]
16523 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
16524 ; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
16525 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16526 ; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1}
16527 ; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm0 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
16528 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16529 ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
16530 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
16531 ; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
16532 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
16533 ; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 # 64-byte Folded Reload
16534 ; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload
16535 ; AVX512DQ-NEXT: vpternlogq $184, %zmm1, %zmm0, %zmm2
16536 ; AVX512DQ-NEXT: vpternlogq $184, %zmm4, %zmm0, %zmm3
16537 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%rsi)
16538 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, 64(%rsi)
16539 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rdx)
16540 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rdx)
16541 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16542 ; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rcx)
16543 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16544 ; AVX512DQ-NEXT: vmovaps %zmm0, (%rcx)
16545 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, 64(%r8)
16546 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, (%r8)
16547 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16548 ; AVX512DQ-NEXT: vmovaps %zmm0, (%r9)
16549 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
16550 ; AVX512DQ-NEXT: vmovaps %zmm0, 64(%r9)
16551 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
16552 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%rax)
16553 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rax)
16554 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
16555 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rax)
16556 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rax)
16557 ; AVX512DQ-NEXT: addq $1592, %rsp # imm = 0x638
16558 ; AVX512DQ-NEXT: vzeroupper
16559 ; AVX512DQ-NEXT: retq
16561 ; AVX512DQ-FCP-LABEL: load_i16_stride7_vf64:
16562 ; AVX512DQ-FCP: # %bb.0:
16563 ; AVX512DQ-FCP-NEXT: subq $1240, %rsp # imm = 0x4D8
16564 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm2
16565 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4
16566 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,5,9,0,12,0,0,0]
16567 ; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm17, %zmm0
16568 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15]
16569 ; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm3, %zmm1
16570 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm11
16571 ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm3, %zmm4
16572 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm18
16573 ; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %ymm12
16574 ; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm14
16575 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm12[2],ymm14[3,4,5],ymm12[6],ymm14[7]
16576 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
16577 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7]
16578 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u]
16579 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm6
16580 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
16581 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
16582 ; AVX512DQ-FCP-NEXT: vporq %ymm4, %ymm6, %ymm20
16583 ; AVX512DQ-FCP-NEXT: vmovdqa 672(%rdi), %xmm7
16584 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15]
16585 ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm6
16586 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm19
16587 ; AVX512DQ-FCP-NEXT: vpbroadcastw 700(%rdi), %xmm7
16588 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm26 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
16589 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm24
16590 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm24[0,1,0,2]
16591 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16592 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1
16593 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm6
16594 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
16595 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7]
16596 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm8
16597 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
16598 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7]
16599 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm3
16600 ; AVX512DQ-FCP-NEXT: vporq %ymm1, %ymm3, %ymm31
16601 ; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm5
16602 ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm1
16603 ; AVX512DQ-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm3
16604 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
16605 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16606 ; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm13
16607 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
16608 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
16609 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
16610 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm15
16611 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm16
16612 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
16613 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7]
16614 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
16615 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
16616 ; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
16617 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16618 ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm0
16619 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
16620 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7]
16621 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm25
16622 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm29
16623 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6
16624 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7]
16625 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
16626 ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6
16627 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm7
16628 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
16629 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm9
16630 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6],ymm9[7]
16631 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm5[1],xmm13[2,3,4,5,6,7]
16632 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm13, %xmm27
16633 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15]
16634 ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm10, %xmm10
16635 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9
16636 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16637 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7]
16638 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm23
16639 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
16640 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7]
16641 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm8
16642 ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm17, %zmm2
16643 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
16644 ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm8, %ymm2
16645 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16646 ; AVX512DQ-FCP-NEXT: vmovdqa 608(%rdi), %ymm0
16647 ; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
16648 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7]
16649 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm21
16650 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm30
16651 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8
16652 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm8[4],xmm3[5],xmm8[6],xmm3[7]
16653 ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %ymm22
16654 ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4
16655 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm22[0,1,0,2]
16656 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
16657 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm6
16658 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm17
16659 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3,4,5,6],ymm6[7]
16660 ; AVX512DQ-FCP-NEXT: vmovdqa 688(%rdi), %xmm14
16661 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm1
16662 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16663 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm1[1],xmm14[2,3,4,5,6,7]
16664 ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm7
16665 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,9,0,13,0,0,0]
16666 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm0
16667 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16668 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm10
16669 ; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16670 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6,7]
16671 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm15, %ymm2
16672 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16673 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
16674 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7]
16675 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm28
16676 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16677 ; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm4, %zmm8
16678 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
16679 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm8
16680 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
16681 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm6
16682 ; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0
16683 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16684 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm3
16685 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm11
16686 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm11[3],ymm3[4,5],ymm11[6],ymm3[7]
16687 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8
16688 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7]
16689 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
16690 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm6
16691 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm25
16692 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
16693 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm16 = [2,5,2,5,2,5,2,5]
16694 ; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm16, %ymm13
16695 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
16696 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0,1,2,3,4,5,6],ymm13[7]
16697 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm0
16698 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
16699 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm19
16700 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
16701 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm15
16702 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm0
16703 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16704 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
16705 ; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16706 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16707 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6,7]
16708 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15
16709 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3,4,5],xmm15[6],xmm13[7]
16710 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm9
16711 ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm4, %zmm4
16712 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16713 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4
16714 ; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm9, %ymm4
16715 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16716 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm5
16717 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm4
16718 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
16719 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm21
16720 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm7
16721 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3,4,5],xmm4[6],xmm7[7]
16722 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm7
16723 ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm4
16724 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
16725 ; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm16, %ymm7
16726 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
16727 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7]
16728 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3]
16729 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm14, %xmm30
16730 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm8
16731 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm1
16732 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16733 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7]
16734 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm8
16735 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0],xmm4[1],xmm8[2],xmm4[3],xmm8[4,5,6,7]
16736 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [10,3,6,15,12,13,6,15]
16737 ; AVX512DQ-FCP-NEXT: vpermd %zmm28, %zmm16, %zmm9
16738 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
16739 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm9
16740 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
16741 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4
16742 ; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm9, %ymm1
16743 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16744 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm3[1],ymm11[2,3],ymm3[4],ymm11[5,6,7]
16745 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm9
16746 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3,4,5],xmm9[6],xmm4[7]
16747 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
16748 ; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm4
16749 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm15
16750 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
16751 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm24[0,1,1,3]
16752 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm14
16753 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
16754 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11]
16755 ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6
16756 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm23
16757 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7]
16758 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm14
16759 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0],xmm6[1],xmm14[2],xmm6[3],xmm14[4,5,6,7]
16760 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm6
16761 ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm16, %zmm13
16762 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm13, %ymm8
16763 ; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0
16764 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16765 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm12
16766 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm5[1],ymm12[2,3],ymm5[4],ymm12[5,6,7]
16767 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2
16768 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8
16769 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7]
16770 ; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm6
16771 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8
16772 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm22[0,1,1,3]
16773 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm9
16774 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
16775 ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7
16776 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm29
16777 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm3[1],ymm11[2,3,4],ymm3[5],ymm11[6,7]
16778 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
16779 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
16780 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
16781 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
16782 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
16783 ; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
16784 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm4
16785 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
16786 ; AVX512DQ-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm4
16787 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm5
16788 ; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm27, %xmm7
16789 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
16790 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm28
16791 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1
16792 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
16793 ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm21
16794 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [3,6,10,13,3,6,10,13]
16795 ; AVX512DQ-FCP-NEXT: vpermd %zmm21, %zmm27, %zmm3
16796 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
16797 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
16798 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
16799 ; AVX512DQ-FCP-NEXT: movw $992, %ax # imm = 0x3E0
16800 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
16801 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm26, %zmm1, %zmm20 {%k1}
16802 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16803 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm2[1],ymm12[2,3,4],ymm2[5],ymm12[6,7]
16804 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
16805 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
16806 ; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
16807 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
16808 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm1
16809 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
16810 ; AVX512DQ-FCP-NEXT: vpbroadcastw 680(%rdi), %xmm1
16811 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm4
16812 ; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm30, %xmm2
16813 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
16814 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm26
16815 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7]
16816 ; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm0, %ymm1
16817 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
16818 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm3
16819 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
16820 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,5,9,12,2,5,9,12]
16821 ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm6
16822 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31]
16823 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm6
16824 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7],ymm6[8,9,10,11,12],ymm3[13,14,15]
16825 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm11
16826 ; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm19, %xmm6
16827 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
16828 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm3, %zmm3
16829 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16830 ; AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm6
16831 ; AVX512DQ-FCP-NEXT: vmovdqa 736(%rdi), %ymm10
16832 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm6[2,3],ymm10[4,5],ymm6[6,7]
16833 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm16
16834 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm17
16835 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6
16836 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3],xmm6[4],xmm3[5],xmm6[6,7]
16837 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11]
16838 ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3
16839 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm30
16840 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,4,7,11,14]
16841 ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm20, %zmm14
16842 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
16843 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm14
16844 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3,4,5,6],xmm14[7]
16845 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7]
16846 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
16847 ; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm8
16848 ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm27, %zmm14
16849 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm9
16850 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5,6],ymm8[7]
16851 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm31 {%k1} # 16-byte Folded Reload
16852 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16853 ; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm0, %ymm0
16854 ; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0
16855 ; AVX512DQ-FCP-NEXT: vpermd %zmm21, %zmm1, %zmm8
16856 ; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm8, %ymm8
16857 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7],ymm8[8,9,10,11,12],ymm0[13,14,15]
16858 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
16859 ; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm9, %xmm8
16860 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
16861 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm31
16862 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm0
16863 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16864 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,4,7,0,0,4,7,0]
16865 ; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1]
16866 ; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm7, %ymm0
16867 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
16868 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,6,9,13,2,6,9,13]
16869 ; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm8, %zmm2
16870 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
16871 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15]
16872 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7]
16873 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
16874 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm4
16875 ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
16876 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
16877 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16878 ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4
16879 ; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm1, %zmm2
16880 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
16881 ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
16882 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm2[6,7]
16883 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16884 ; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm0
16885 ; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm15
16886 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm0[2,3],ymm15[4,5],ymm0[6,7]
16887 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2
16888 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm14
16889 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3],xmm14[4],xmm5[5],xmm14[6,7]
16890 ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10
16891 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm18
16892 ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm20, %zmm14
16893 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm6
16894 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1,2],xmm10[3,4,5,6],xmm6[7]
16895 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
16896 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14
16897 ; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm1, %zmm1
16898 ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1
16899 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm1[6,7]
16900 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16901 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm2[3],ymm15[4,5],ymm2[6],ymm15[7]
16902 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm11
16903 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6
16904 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4],xmm6[5],xmm1[6],xmm6[7]
16905 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
16906 ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
16907 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [1,0,0,0,4,8,11,15]
16908 ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm20, %zmm10
16909 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
16910 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm10
16911 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3,4,5,6],xmm10[7]
16912 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
16913 ; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm8, %zmm10
16914 ; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29]
16915 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm10
16916 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm10[6,7]
16917 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16918 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm5
16919 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm0
16920 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm0[3],ymm5[4,5],ymm0[6],ymm5[7]
16921 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm10
16922 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm1[4],xmm10[5],xmm1[6],xmm10[7]
16923 ; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
16924 ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm20, %zmm6
16925 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm3
16926 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6],xmm3[7]
16927 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
16928 ; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm8, %zmm3
16929 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
16930 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
16931 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16932 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm15[2],ymm11[3,4,5],ymm15[6],ymm11[7]
16933 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm16
16934 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
16935 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
16936 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
16937 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
16938 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,0,0,0,5,8,12,15]
16939 ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm3, %zmm6
16940 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128]
16941 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6
16942 ; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm1, %ymm1
16943 ; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm27, %zmm6
16944 ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm6
16945 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7]
16946 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
16947 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16948 ; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload
16949 ; AVX512DQ-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
16950 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
16951 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1}
16952 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16953 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm5[2],ymm0[3,4,5],ymm5[6],ymm0[7]
16954 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm14
16955 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6
16956 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5],xmm6[6],xmm1[7]
16957 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
16958 ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm3, %zmm2
16959 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2
16960 ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1
16961 ; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm27, %zmm0
16962 ; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0
16963 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
16964 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16965 ; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload
16966 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
16967 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16968 ; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm7, %ymm0
16969 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
16970 ; AVX512DQ-FCP-NEXT: vpermd %zmm21, %zmm8, %zmm1
16971 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
16972 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
16973 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm1
16974 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
16975 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm2
16976 ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
16977 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
16978 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16979 ; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm1
16980 ; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm2
16981 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7]
16982 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm11
16983 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm8
16984 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
16985 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
16986 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm7
16987 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm15[3],ymm7[4,5],ymm15[6],ymm7[7]
16988 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
16989 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7]
16990 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9]
16991 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
16992 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
16993 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [2,11,2,11,12,5,8,9]
16994 ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm9, %zmm6
16995 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
16996 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6
16997 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15]
16998 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128]
16999 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
17000 ; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm2, %ymm2
17001 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
17002 ; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm23 # 64-byte Folded Reload
17003 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm23 {%k1}
17004 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17005 ; AVX512DQ-FCP-NEXT: vmovdqa 864(%rdi), %ymm1
17006 ; AVX512DQ-FCP-NEXT: vmovdqa 832(%rdi), %ymm13
17007 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm1[2],ymm13[3,4,5],ymm1[6],ymm13[7]
17008 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm31
17009 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
17010 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
17011 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
17012 ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm9, %zmm2
17013 ; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2
17014 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2],ymm5[3],ymm14[4,5],ymm5[6],ymm14[7]
17015 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm23
17016 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm22
17017 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm9
17018 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm3[1],xmm9[2,3,4,5],xmm3[6],xmm9[7]
17019 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3
17020 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
17021 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
17022 ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
17023 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
17024 ; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm29 # 64-byte Folded Reload
17025 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm29 {%k1}
17026 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17027 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
17028 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
17029 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7]
17030 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
17031 ; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm2
17032 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm17
17033 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
17034 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
17035 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
17036 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [0,3,7,10,14,0,0,0]
17037 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
17038 ; AVX512DQ-FCP-NEXT: vpermd %zmm27, %zmm21, %zmm3
17039 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
17040 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3
17041 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm5
17042 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
17043 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm28
17044 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7]
17045 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm12
17046 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm19
17047 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
17048 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
17049 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6,7]
17050 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm11
17051 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm10
17052 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3,4,5],xmm10[6],xmm3[7]
17053 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
17054 ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0
17055 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
17056 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,0,0,0,6,9,13,0]
17057 ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm16, %zmm2
17058 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128]
17059 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm2
17060 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
17061 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,2,3,0,1,14,15,12,13,128,128]
17062 ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3
17063 ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
17064 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
17065 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm28 {%k1}
17066 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
17067 ; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
17068 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7]
17069 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm20
17070 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm8
17071 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm2
17072 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
17073 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
17074 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
17075 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
17076 ; AVX512DQ-FCP-NEXT: vpermd %zmm17, %zmm21, %zmm2
17077 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
17078 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm29
17079 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
17080 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm0, %zmm25, %zmm26
17081 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm5
17082 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7]
17083 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
17084 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
17085 ; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0
17086 ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm16, %zmm2
17087 ; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
17088 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm4
17089 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm7
17090 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6,7]
17091 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
17092 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
17093 ; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2
17094 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
17095 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
17096 ; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1
17097 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
17098 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm26 {%k1}
17099 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7]
17100 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm23
17101 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm31
17102 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
17103 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
17104 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
17105 ; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
17106 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24
17107 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
17108 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [0,4,7,11,14,0,0,0]
17109 ; AVX512DQ-FCP-NEXT: vpermd %zmm27, %zmm22, %zmm3
17110 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
17111 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
17112 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
17113 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm6
17114 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm12[1],ymm6[2,3],ymm12[4],ymm6[5,6,7]
17115 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm21
17116 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm10
17117 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3,4,5],xmm10[6],xmm3[7]
17118 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0],ymm11[1],ymm15[2,3,4],ymm11[5],ymm15[6,7]
17119 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm15, %ymm19
17120 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12
17121 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7]
17122 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
17123 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3
17124 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
17125 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,0,0,0,6,10,13,0]
17126 ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm16, %zmm1
17127 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128]
17128 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
17129 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15]
17130 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,6,7,4,5,2,3,0,1,14,15,128,128]
17131 ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm10, %xmm10
17132 ; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm10, %ymm1
17133 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
17134 ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload
17135 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1}
17136 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm10
17137 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm10[2,3],ymm14[4,5],ymm10[6,7]
17138 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
17139 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3
17140 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm8
17141 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
17142 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
17143 ; AVX512DQ-FCP-NEXT: vpermd %zmm17, %zmm22, %zmm3
17144 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
17145 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm3[2,3,4,5,6,7]
17146 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5,6,7]
17147 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm20
17148 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
17149 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7]
17150 ; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1
17151 ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm16, %zmm3
17152 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
17153 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7]
17154 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm22
17155 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
17156 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5,6,7]
17157 ; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm3
17158 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
17159 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
17160 ; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
17161 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
17162 ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm9 # 64-byte Folded Reload
17163 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm9 {%k1}
17164 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm1
17165 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm2
17166 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
17167 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
17168 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
17169 ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
17170 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
17171 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
17172 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,4,8,11,15,0,0,0]
17173 ; AVX512DQ-FCP-NEXT: vpermd %zmm27, %zmm5, %zmm2
17174 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm15
17175 ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2
17176 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm2[2,3,4,5,6,7]
17177 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm8
17178 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7]
17179 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
17180 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
17181 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm2
17182 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3,4],ymm11[5],ymm2[6,7]
17183 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15]
17184 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm6
17185 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
17186 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
17187 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
17188 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,1,10,3,14,7,10,3]
17189 ; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm16, %zmm8
17190 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
17191 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm8
17192 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3,4,5,6,7]
17193 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15]
17194 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
17195 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
17196 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
17197 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
17198 ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm12 # 64-byte Folded Reload
17199 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm12 {%k1}
17200 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7]
17201 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6
17202 ; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm4
17203 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
17204 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
17205 ; AVX512DQ-FCP-NEXT: vpermd %zmm17, %zmm5, %zmm4
17206 ; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm4
17207 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload
17208 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload
17209 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
17210 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5,6,7]
17211 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm4
17212 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4],ymm13[5],ymm4[6,7]
17213 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm11
17214 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm4[1],xmm11[2],xmm4[3],xmm11[4,5,6,7]
17215 ; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4
17216 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm8
17217 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
17218 ; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3
17219 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
17220 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7]
17221 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
17222 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload
17223 ; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm16, %zmm7
17224 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm2
17225 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3,4,5,6,7]
17226 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3
17227 ; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
17228 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
17229 ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload
17230 ; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1}
17231 ; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm2 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
17232 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17233 ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload
17234 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
17235 ; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload
17236 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
17237 ; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload
17238 ; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
17239 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm3, %zmm2, %zmm10
17240 ; AVX512DQ-FCP-NEXT: vpternlogq $184, %zmm4, %zmm2, %zmm8
17241 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rsi)
17242 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rsi)
17243 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%rdx)
17244 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rdx)
17245 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17246 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rcx)
17247 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17248 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rcx)
17249 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17250 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%r8)
17251 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17252 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%r8)
17253 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 64(%r9)
17254 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, (%r9)
17255 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
17256 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 64(%rax)
17257 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
17258 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
17259 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax)
17260 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%rax)
17261 ; AVX512DQ-FCP-NEXT: addq $1240, %rsp # imm = 0x4D8
17262 ; AVX512DQ-FCP-NEXT: vzeroupper
17263 ; AVX512DQ-FCP-NEXT: retq
17264 ;
17265 ; AVX512BW-LABEL: load_i16_stride7_vf64:
17266 ; AVX512BW: # %bb.0:
17267 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
17268 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
17269 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0
17270 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2
17271 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm5
17272 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm3
17273 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6
17274 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm7
17275 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm4
17276 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm13
17277 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm15
17278 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm10
17279 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm11
17280 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
17281 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm14
17282 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm12
17283 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9]
17284 ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
17285 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17
17286 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm16, %zmm17
17287 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0]
17288 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9
17289 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm8, %zmm9
17290 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0
17291 ; AVX512BW-NEXT: kmovd %edi, %k2
17292 ; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm9 {%k2}
17293 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0]
17294 ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
17295 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm18
17296 ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm17, %zmm18
17297 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57,0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57]
17298 ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
17299 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm18
17300 ; AVX512BW-NEXT: movl $-524288, %edi # imm = 0xFFF80000
17301 ; AVX512BW-NEXT: kmovd %edi, %k1
17302 ; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm9 {%k1}
17303 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm4, %zmm17
17304 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm19, %zmm17
17305 ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm16
17306 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm8
17307 ; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm8 {%k2}
17308 ; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm8 {%k1}
17309 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0]
17310 ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
17311 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm19
17312 ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm18, %zmm19
17313 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58]
17314 ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
17315 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm19
17316 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm21 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0]
17317 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm22
17318 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm21, %zmm22
17319 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42]
17320 ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
17321 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm16
17322 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm17, %zmm16
17323 ; AVX512BW-NEXT: movl $511, %edi # imm = 0x1FF
17324 ; AVX512BW-NEXT: kmovd %edi, %k2
17325 ; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2}
17326 ; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm16 {%k1}
17327 ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm4, %zmm18
17328 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm20, %zmm18
17329 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm17
17330 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm21
17331 ; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm17 {%k2}
17332 ; AVX512BW-NEXT: vmovdqu16 %zmm18, %zmm17 {%k1}
17333 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0]
17334 ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
17335 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm21
17336 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm20, %zmm21
17337 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59]
17338 ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
17339 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm21
17340 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43]
17341 ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
17342 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24
17343 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm23, %zmm24
17344 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0]
17345 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19
17346 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm18, %zmm19
17347 ; AVX512BW-NEXT: movl $261632, %edi # imm = 0x3FE00
17348 ; AVX512BW-NEXT: kmovd %edi, %k1
17349 ; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm19 {%k1}
17350 ; AVX512BW-NEXT: movw $-512, %di # imm = 0xFE00
17351 ; AVX512BW-NEXT: kmovd %edi, %k2
17352 ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm19 {%k2}
17353 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm20
17354 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm22, %zmm20
17355 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm23
17356 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm18
17357 ; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm18 {%k1}
17358 ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm18 {%k2}
17359 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0]
17360 ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
17361 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm23
17362 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm22, %zmm23
17363 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60,0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60]
17364 ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
17365 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm23
17366 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44]
17367 ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
17368 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm26
17369 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm25, %zmm26
17370 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0]
17371 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21
17372 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm20, %zmm21
17373 ; AVX512BW-NEXT: vmovdqu16 %zmm26, %zmm21 {%k1}
17374 ; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm21 {%k2}
17375 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm22
17376 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm24, %zmm22
17377 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm25
17378 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm20
17379 ; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k1}
17380 ; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2}
17381 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0]
17382 ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
17383 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25
17384 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm25
17385 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61]
; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2w %zmm13, %zmm26, %zmm25
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45]
; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm28
; AVX512BW-NEXT: vpermt2w %zmm12, %zmm27, %zmm28
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm22 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23
; AVX512BW-NEXT: vpermt2w %zmm11, %zmm22, %zmm23
; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm23 {%k1}
; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2}
; AVX512BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm24
; AVX512BW-NEXT: vpermt2w %zmm6, %zmm26, %zmm24
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm27
; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm22
; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm22 {%k1}
; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0]
; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm25
; AVX512BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm25
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62]
; AVX512BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2w %zmm13, %zmm26, %zmm25
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14]
; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm28
; AVX512BW-NEXT: vpermt2w %zmm14, %zmm27, %zmm28
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm29 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30
; AVX512BW-NEXT: vpermt2w %zmm11, %zmm29, %zmm30
; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm30 {%k1}
; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm30 {%k2}
; AVX512BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm24
; AVX512BW-NEXT: vpermt2w %zmm6, %zmm26, %zmm24
; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm27
; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm29
; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm29 {%k1}
; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm29 {%k2}
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0]
; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2w %zmm15, %zmm24, %zmm10
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63]
; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm10
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15]
; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2w %zmm14, %zmm13, %zmm12
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0]
; AVX512BW-NEXT: vpermt2w %zmm11, %zmm14, %zmm1
; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2}
; AVX512BW-NEXT: vpermt2w %zmm7, %zmm24, %zmm4
; AVX512BW-NEXT: vpermt2w %zmm6, %zmm15, %zmm4
; AVX512BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm3
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm14, %zmm0
; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1}
; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k2}
; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%rsi)
; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rsi)
; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rcx)
; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rcx)
; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%r8)
; AVX512BW-NEXT: vmovdqa64 %zmm21, (%r8)
; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r9)
; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r9)
; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%r10)
; AVX512BW-NEXT: vmovdqa64 %zmm30, (%r10)
; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i16_stride7_vf64:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2
; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm5
; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm3
; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6
; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm7
; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm4
; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13
; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15
; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm10
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm11
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm14
; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9]
; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17
; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm16, %zmm17
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9
; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm8, %zmm9
; AVX512BW-FCP-NEXT: movw $992, %di # imm = 0x3E0
; AVX512BW-FCP-NEXT: kmovd %edi, %k2
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm17, %zmm9 {%k2}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0]
; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm18
; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm17, %zmm18
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57,0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57]
; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm19, %zmm18
; AVX512BW-FCP-NEXT: movl $-524288, %edi # imm = 0xFFF80000
; AVX512BW-FCP-NEXT: kmovd %edi, %k1
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm9 {%k1}
; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm4, %zmm17
; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm19, %zmm17
; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm16
; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm8
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm8 {%k2}
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm8 {%k1}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0]
; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm19
; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm18, %zmm19
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58]
; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm19
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm21 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm22
; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm21, %zmm22
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42]
; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm16
; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm17, %zmm16
; AVX512BW-FCP-NEXT: movl $511, %edi # imm = 0x1FF
; AVX512BW-FCP-NEXT: kmovd %edi, %k2
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2}
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm16 {%k1}
; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm4, %zmm18
; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm20, %zmm18
; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm17
; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm21
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm17 {%k2}
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm17 {%k1}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0]
; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm21
; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm20, %zmm21
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59]
; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm22, %zmm21
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43]
; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24
; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm23, %zmm24
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19
; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm18, %zmm19
; AVX512BW-FCP-NEXT: movl $261632, %edi # imm = 0x3FE00
; AVX512BW-FCP-NEXT: kmovd %edi, %k1
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm19 {%k1}
; AVX512BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00
; AVX512BW-FCP-NEXT: kmovd %edi, %k2
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm19 {%k2}
; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm7, %zmm20
; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm22, %zmm20
; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm23
; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm18
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm23, %zmm18 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm18 {%k2}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0]
; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm23
; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm22, %zmm23
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60,0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60]
; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm24, %zmm23
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44]
; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm26
; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm25, %zmm26
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21
; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm20, %zmm21
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm26, %zmm21 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm21 {%k2}
; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm7, %zmm22
; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm24, %zmm22
; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm25
; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm20
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm20 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0]
; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm25
; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm24, %zmm25
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61]
; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm26, %zmm25
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45]
; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm28
; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm27, %zmm28
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm22 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23
; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm22, %zmm23
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm28, %zmm23 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2}
; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm7, %zmm24
; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm26, %zmm24
; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm27
; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm22
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm27, %zmm22 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0]
; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm25
; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm24, %zmm25
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62]
; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm26, %zmm25
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14]
; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm28
; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm27, %zmm28
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm29 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30
; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm29, %zmm30
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm28, %zmm30 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm30 {%k2}
; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm7, %zmm24
; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm26, %zmm24
; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm27
; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm29
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm27, %zmm29 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm29 {%k2}
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0]
; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm24, %zmm10
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63]
; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm10
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15]
; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm13, %zmm12
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm14, %zmm1
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm1 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2}
; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm24, %zmm4
; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm15, %zmm4
; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm13, %zmm3
; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm14, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm0 {%k2}
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%rsi)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rsi)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, (%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rcx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%r8)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, (%r8)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r9)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, (%r9)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 64(%r10)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, (%r10)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i16_stride7_vf64:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm5
; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm7
; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm4
; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm13
; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm15
; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm10
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm11
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm14
; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm12
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9]
; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm17
; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm16, %zmm17
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm9
; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm8, %zmm9
; AVX512DQ-BW-NEXT: movw $992, %di # imm = 0x3E0
; AVX512DQ-BW-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm17, %zmm9 {%k2}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0]
; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm18
; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm17, %zmm18
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57,0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57]
; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm18
; AVX512DQ-BW-NEXT: movl $-524288, %edi # imm = 0xFFF80000
; AVX512DQ-BW-NEXT: kmovd %edi, %k1
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm18, %zmm9 {%k1}
; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm4, %zmm17
; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm19, %zmm17
; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm16
; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm8
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm16, %zmm8 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm17, %zmm8 {%k1}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0]
; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm19
; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm18, %zmm19
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58]
; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm19
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm21 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm22
; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm21, %zmm22
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42]
; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm16
; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm17, %zmm16
; AVX512DQ-BW-NEXT: movl $511, %edi # imm = 0x1FF
; AVX512DQ-BW-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm19, %zmm16 {%k1}
; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm4, %zmm18
; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm20, %zmm18
; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm17
; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm21
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm21, %zmm17 {%k2}
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm18, %zmm17 {%k1}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0]
; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm21
; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm20, %zmm21
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59]
; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm21
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43]
; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm24
; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm23, %zmm24
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19
; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm18, %zmm19
; AVX512DQ-BW-NEXT: movl $261632, %edi # imm = 0x3FE00
; AVX512DQ-BW-NEXT: kmovd %edi, %k1
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm24, %zmm19 {%k1}
; AVX512DQ-BW-NEXT: movw $-512, %di # imm = 0xFE00
; AVX512DQ-BW-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm19 {%k2}
; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm20
; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm22, %zmm20
; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm23
; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm18
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm23, %zmm18 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm18 {%k2}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0]
; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm23
; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm22, %zmm23
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60,0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60]
; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm23
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44]
; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm26
; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm25, %zmm26
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm20 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm21
; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm20, %zmm21
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm26, %zmm21 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm23, %zmm21 {%k2}
; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm22
; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm24, %zmm22
; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm25
; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm20
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm25, %zmm20 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0]
; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm25
; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm25
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61]
; AVX512DQ-BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm26, %zmm25
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45]
; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm28
; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm27, %zmm28
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm22 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm23
; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm22, %zmm23
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm28, %zmm23 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2}
; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm24
; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm26, %zmm24
; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm27
; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm22
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm27, %zmm22 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0]
; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm25
; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm24, %zmm25
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62]
; AVX512DQ-BW-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm26, %zmm25
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14]
; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm28
; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm27, %zmm28
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm29 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm30
; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm29, %zmm30
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm28, %zmm30 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm30 {%k2}
; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm24
; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm26, %zmm24
; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm27
; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm29
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm27, %zmm29 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm24, %zmm29 {%k2}
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0]
; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2w %zmm15, %zmm24, %zmm10
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63]
; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm10
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15]
; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm13, %zmm12
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm14, %zmm1
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm12, %zmm1 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2}
; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm24, %zmm4
; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm15, %zmm4
; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm3
; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm14, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1}
; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k2}
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 64(%rsi)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rsi)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 64(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 64(%rcx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rcx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 64(%r8)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, (%r8)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%r9)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, (%r9)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 64(%r10)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, (%r10)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i16_stride7_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm5
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm15
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm11
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm14
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm12
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9]
; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm17
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm16, %zmm17
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm8, %zmm9
; AVX512DQ-BW-FCP-NEXT: movw $992, %di # imm = 0x3E0
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm17, %zmm9 {%k2}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm18
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm17, %zmm18
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57,0,0,0,19,20,21,22,23,24,25,26,27,36,43,50,57]
; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm19, %zmm18
; AVX512DQ-BW-FCP-NEXT: movl $-524288, %edi # imm = 0xFFF80000
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm9 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm4, %zmm17
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm19, %zmm17
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm16
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm8
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm8 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm8 {%k1}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0,38,45,52,59,2,9,16,23,30,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm19
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm18, %zmm19
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58,0,0,0,19,20,21,22,23,24,25,26,27,37,44,51,58]
; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm20, %zmm19
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm21 = [1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm22
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm21, %zmm22
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42]
; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm16
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm17, %zmm16
; AVX512DQ-BW-FCP-NEXT: movl $511, %edi # imm = 0x1FF
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm16 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm16 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm4, %zmm18
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm20, %zmm18
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm17
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm21
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm17 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm18, %zmm17 {%k1}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0,0,0,0,7,14,21,28,35,42,49,56,63,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm21
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm20, %zmm21
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59,0,0,18,19,20,21,22,23,24,25,26,27,38,45,52,59]
; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm22, %zmm21
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43]
; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm23, %zmm24
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm18, %zmm19
; AVX512DQ-BW-FCP-NEXT: movl $261632, %edi # imm = 0x3FE00
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm24, %zmm19 {%k1}
; AVX512DQ-BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm19 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm7, %zmm20
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm22, %zmm20
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm23
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm18
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm23, %zmm18 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm18 {%k2}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0,0,0,1,8,15,22,29,36,43,50,57,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm23
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm22, %zmm23
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60,0,0,18,19,20,21,22,23,24,25,26,32,39,46,53,60]
; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm24, %zmm23
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44]
; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm26
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm25, %zmm26
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm21
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm20, %zmm21
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm26, %zmm21 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm21 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm7, %zmm22
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm24, %zmm22
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm25
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm20
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm20 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0,0,0,2,9,16,23,30,37,44,51,58,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm25
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm24, %zmm25
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61,0,0,18,19,20,21,22,23,24,25,26,33,40,47,54,61]
; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm26, %zmm25
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45]
; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm28
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm27, %zmm28
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm22 = [36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm23
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm22, %zmm23
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm28, %zmm23 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm7, %zmm24
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm26, %zmm24
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm5, %zmm27
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm22
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm27, %zmm22 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm22 {%k2}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0,0,0,3,10,17,24,31,38,45,52,59,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm25
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm24, %zmm25
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62,0,0,18,19,20,21,22,23,24,25,26,34,41,48,55,62]
; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm26, %zmm25
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14]
; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm28
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm27, %zmm28
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm29 = [37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm29, %zmm30
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm28, %zmm30 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm25, %zmm30 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm7, %zmm24
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm26, %zmm24
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm3, %zmm27
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm29
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm27, %zmm29 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm29 {%k2}
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0,0,0,36,43,50,57,0,7,14,21,28,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm24, %zmm10
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63,0,0,18,19,20,21,22,23,24,25,26,35,42,49,56,63]
; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm15, %zmm10
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15]
; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm13, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [38,45,52,59,2,9,16,23,30,0,0,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm14, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm1 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm24, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm15, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm13, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm14, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm0 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 64(%r10)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, (%r10)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <448 x i16>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <448 x i16> %wide.vec, <448 x i16> poison, <64 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105, i32 112, i32 119, i32 126, i32 133, i32 140, i32 147, i32 154, i32 161, i32 168, i32 175, i32 182, i32 189, i32 196, i32 203, i32 210, i32 217, i32 224, i32 231, i32 238, i32 245, i32 252, i32 259, i32 266, i32 273, i32 280, i32 287, i32 294, i32 301, i32 308, i32 315, i32 322, i32 329, i32 336, i32 343, i32 350, i32 357, i32 364, i32 371, i32 378, i32 385, i32 392, i32 399, i32 406, i32 413, i32 420, i32 427, i32 434, i32 441>
%strided.vec1 = shufflevector <448 x i16> %wide.vec, <448 x i16> poison, <64 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106, i32 113, i32 120, i32 127, i32 134, i32 141, i32 148, i32 155, i32 162, i32 169, i32 176, i32 183, i32 190, i32 197, i32 204, i32 211, i32 218, i32 225, i32 232, i32 239, i32 246, i32 253, i32 260, i32 267, i32 274, i32 281, i32 288, i32 295, i32 302, i32 309, i32 316, i32 323, i32 330, i32 337, i32 344, i32 351, i32 358, i32 365, i32 372, i32 379, i32 386, i32 393, i32 400, i32 407, i32 414, i32 421, i32 428, i32 435, i32 442>
%strided.vec2 = shufflevector <448 x i16> %wide.vec, <448 x i16> poison, <64 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107, i32 114, i32 121, i32 128, i32 135, i32 142, i32 149, i32 156, i32 163, i32 170, i32 177, i32 184, i32 191, i32 198, i32 205, i32 212, i32 219, i32 226, i32 233, i32 240, i32 247, i32 254, i32 261, i32 268, i32 275, i32 282, i32 289, i32 296, i32 303, i32 310, i32 317, i32 324, i32 331, i32 338, i32 345, i32 352, i32 359, i32 366, i32 373, i32 380, i32 387, i32 394, i32 401, i32 408, i32 415, i32 422, i32 429, i32 436, i32 443>
%strided.vec3 = shufflevector <448 x i16> %wide.vec, <448 x i16> poison, <64 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108, i32 115, i32 122, i32 129, i32 136, i32 143, i32 150, i32 157, i32 164, i32 171, i32 178, i32 185, i32 192, i32 199, i32 206, i32 213, i32 220, i32 227, i32 234, i32 241, i32 248, i32 255, i32 262, i32 269, i32 276, i32 283, i32 290, i32 297, i32 304, i32 311, i32 318, i32 325, i32 332, i32 339, i32 346, i32 353, i32 360, i32 367, i32 374, i32 381, i32 388, i32 395, i32 402, i32 409, i32 416, i32 423, i32 430, i32 437, i32 444>
%strided.vec4 = shufflevector <448 x i16> %wide.vec, <448 x i16> poison, <64 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109, i32 116, i32 123, i32 130, i32 137, i32 144, i32 151, i32 158, i32 165, i32 172, i32 179, i32 186, i32 193, i32 200, i32 207, i32 214, i32 221, i32 228, i32 235, i32 242, i32 249, i32 256, i32 263, i32 270, i32 277, i32 284, i32 291, i32 298, i32 305, i32 312, i32 319, i32 326, i32 333, i32 340, i32 347, i32 354, i32 361, i32 368, i32 375, i32 382, i32 389, i32 396, i32 403, i32 410, i32 417, i32 424, i32 431, i32 438, i32 445>
%strided.vec5 = shufflevector <448 x i16> %wide.vec, <448 x i16> poison, <64 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110, i32 117, i32 124, i32 131, i32 138, i32 145, i32 152, i32 159, i32 166, i32 173, i32 180, i32 187, i32 194, i32 201, i32 208, i32 215, i32 222, i32 229, i32 236, i32 243, i32 250, i32 257, i32 264, i32 271, i32 278, i32 285, i32 292, i32 299, i32 306, i32 313, i32 320, i32 327, i32 334, i32 341, i32 348, i32 355, i32 362, i32 369, i32 376, i32 383, i32 390, i32 397, i32 404, i32 411, i32 418, i32 425, i32 432, i32 439, i32 446>
%strided.vec6 = shufflevector <448 x i16> %wide.vec, <448 x i16> poison, <64 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111, i32 118, i32 125, i32 132, i32 139, i32 146, i32 153, i32 160, i32 167, i32 174, i32 181, i32 188, i32 195, i32 202, i32 209, i32 216, i32 223, i32 230, i32 237, i32 244, i32 251, i32 258, i32 265, i32 272, i32 279, i32 286, i32 293, i32 300, i32 307, i32 314, i32 321, i32 328, i32 335, i32 342, i32 349, i32 356, i32 363, i32 370, i32 377, i32 384, i32 391, i32 398, i32 405, i32 412, i32 419, i32 426, i32 433, i32 440, i32 447>
store <64 x i16> %strided.vec0, ptr %out.vec0, align 64
store <64 x i16> %strided.vec1, ptr %out.vec1, align 64
store <64 x i16> %strided.vec2, ptr %out.vec2, align 64
store <64 x i16> %strided.vec3, ptr %out.vec3, align 64
store <64 x i16> %strided.vec4, ptr %out.vec4, align 64
store <64 x i16> %strided.vec5, ptr %out.vec5, align 64
store <64 x i16> %strided.vec6, ptr %out.vec6, align 64